77a819d426026e8a6ac3d7965af8f44fbb9a0272 hiram Wed Jan 10 12:42:07 2024 -0800 better manage large genome construction and add in RepeatModeler track refs #29545 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 0c6090b..3d32a7c 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -717,41 +717,46 @@ \${asmId}_rm.run \\ \${asmId}_assembly_structure \\ \$asmId.2bit ln -s $assemblySource/\${asmId}_genomic.fna.gz . ln -s $assemblySource/\${asmId}_assembly_report.txt . if [ -s $assemblySource/\${asmId}_rm.out.gz ]; then ln -s $assemblySource/\${asmId}_rm.out.gz . fi if [ -s $assemblySource/\${asmId}_rm.run ]; then ln -s $assemblySource/\${asmId}_rm.run . fi if [ -d $assemblySource/\${asmId}_assembly_structure ]; then ln -s $assemblySource/\${asmId}_assembly_structure . fi - faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit + export asmSize=`grep -v "^#" \${asmId}_assembly_report.txt | head | cut -f9 | ave stdin | grep total | awk '{printf "%d", \$NF}'` + export longArg="" + if [ "\$asmSize" -gt 4294967295 ]; then + longArg="-long" + fi + faToTwoBit \${longArg} \${asmId}_genomic.fna.gz \$asmId.2bit twoBitDup \$asmId.2bit > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then printf "WARNING duplicate sequences found in \$asmId.2bit\\n" 1>&2 cat \$asmId.dups.txt 1>&2 awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list mv \${asmId}_genomic.fna.gz \${asmId}_genomic.fna.dups.gz faSomeRecords -exclude \${asmId}_genomic.fna.dups.gz \\ \$asmId.remove.dups.list stdout | gzip -c > \${asmId}_genomic.fna.gz rm -f \$asmId.2bit - faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit + faToTwoBit \${longArg} \${asmId}_genomic.fna.gz \$asmId.2bit fi gzip -f \$asmId.dups.txt touch -r \${asmId}_genomic.fna.gz \$asmId.2bit else printf "# download step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # doDownload ######################################################################### @@ -908,43 +913,48 @@ twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz twoBitInfo ../download/\$asmId.2bit stdout | cut -f1 \\ | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" \\ | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\$\\)/.\\1/;' \\ | awk '{printf "%s\\t%s\\n", \$1, \$2}' | sort > \$asmId.fake.names _EOF_ ); } } else { printf STDERR "partsDone: %d\n", $partsDone; } $bossScript->add(<<_EOF_ zcat *.agp.gz | gzip > ../\$dbName.agp.gz -faToTwoBit *.fa.gz ../\$dbName.2bit -faToTwoBit -noMask *.fa.gz ../\$dbName.unmasked.2bit +export asmSize=`zgrep -v "^#" ../\$dbName.agp.gz | cut -f3 | ave stdin | grep total | awk '{printf "%d", \$NF}'` +export longArg="" +if [ "\$asmSize" -gt 4294967295 ]; then + longArg="-long" +fi +faToTwoBit \${longArg} *.fa.gz ../\$dbName.2bit +faToTwoBit \${longArg} -noMask *.fa.gz ../\$dbName.unmasked.2bit twoBitDup ../\$dbName.unmasked.2bit > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then printf "ERROR: duplicate sequences found in ../\$dbName.unmasked.2bit\\n" 1>&2 cat \$asmId.dups.txt 1>&2 awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list mv ../\$dbName.unmasked.2bit ../\$dbName.unmasked.dups.2bit twoBitToFa ../\$dbName.unmasked.dups.2bit stdout | faSomeRecords -exclude \\ stdin \$asmId.remove.dups.list stdout | gzip -c > \$asmId.noDups.fasta.gz rm -f ../\$dbName.2bit ../\$dbName.unmasked.2bit - faToTwoBit \$asmId.noDups.fasta.gz ../\$dbName.2bit - faToTwoBit -noMask \$asmId.noDups.fasta.gz ../\$dbName.unmasked.2bit + faToTwoBit \${longArg} \$asmId.noDups.fasta.gz ../\$dbName.2bit + faToTwoBit \${longArg} -noMask \$asmId.noDups.fasta.gz ../\$dbName.unmasked.2bit fi gzip -f \$asmId.dups.txt touch -r ../download/\$asmId.2bit ../\$dbName.2bit touch -r ../download/\$asmId.2bit ../\$dbName.unmasked.2bit touch -r ../download/\$asmId.2bit ../\$dbName.agp.gz twoBitInfo ../\$dbName.2bit stdout | sort -k2nr > ../\$dbName.chrom.sizes touch -r ../\$dbName.2bit ../\$dbName.chrom.sizes # verify everything is there twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes export newTotal=`ave -col=2 ../\$dbName.chrom.sizes | grep "^total"` export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"` if [ "\$newTotal" != "\$oldTotal" ]; then printf "# ERROR: sequence construction error: not same totals source vs. new:\\n" 1>&2 printf "# \$newTotal != \$oldTotal\\n" 1>&2 exit 255 @@ -1559,37 +1569,40 @@ twoBitToFa \$asmId.masked.2bit stdout | gzip -c > \$asmId.fa.gz touch -r \$asmId.masked.2bit \$asmId.fa.gz faSize \$asmId.fa.gz > \$asmId.masked.faSize.txt touch -r \$asmId.masked.2bit \$asmId.masked.faSize.txt bptForTwoBit \$asmId.masked.2bit \$asmId.masked.2bit.bpt touch -r \$asmId.masked.2bit \$asmId.masked.2bit.bpt cp -p \$asmId.fa.gz ../../\$asmId.fa.gz cp -p \$asmId.masked.faSize.txt ../../\$asmId.faSize.txt cp -p \$asmId.masked.2bit.bpt ../../\$asmId.2bit.bpt size=`grep -w bases \$asmId.masked.faSize.txt | cut -d' ' -f1` if [ \$size -lt 4294967297 ]; then ln \$asmId.masked.2bit \$accessionId.2bit gfServer -trans index ../../\$accessionId.trans.gfidx \$accessionId.2bit & gfServer -stepSize=5 index ../../\$accessionId.untrans.gfidx \$accessionId.2bit wait + else + ln \$asmId.masked.2bit \$accessionId.2bit + gfServerHuge -trans index ../../\$accessionId.trans.gfidx \$accessionId.2bit & + gfServerHuge -stepSize=5 index ../../\$accessionId.untrans.gfidx \$accessionId.2bit + wait + fi rm \$accessionId.2bit touch -r \$asmId.masked.2bit ../../\$accessionId.trans.gfidx touch -r \$asmId.masked.2bit ../../\$accessionId.untrans.gfidx else - printf "# genome \$asmId too large at \$size to make blat indexes\\n" 1>&2 - fi -else printf "# addMask step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # addMask ######################################################################### # * step: windowMasker [workhorse] sub doWindowMasker { my $runDir = "$buildDir/trackData/windowMasker"; &HgAutomate::mustMkdir($runDir);