77a819d426026e8a6ac3d7965af8f44fbb9a0272
hiram
  Wed Jan 10 12:42:07 2024 -0800
better manage large genome construction and add in RepeatModeler track refs #29545

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 0c6090b..3d32a7c 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -717,41 +717,46 @@
     \${asmId}_rm.run \\
     \${asmId}_assembly_structure \\
     \$asmId.2bit
 
   ln -s $assemblySource/\${asmId}_genomic.fna.gz .
   ln -s $assemblySource/\${asmId}_assembly_report.txt .
   if [ -s $assemblySource/\${asmId}_rm.out.gz ]; then
     ln -s $assemblySource/\${asmId}_rm.out.gz .
   fi
   if [ -s $assemblySource/\${asmId}_rm.run ]; then
     ln -s $assemblySource/\${asmId}_rm.run .
   fi
   if [ -d $assemblySource/\${asmId}_assembly_structure ]; then
     ln -s $assemblySource/\${asmId}_assembly_structure .
   fi
-  faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit
+  export asmSize=`grep -v "^#" \${asmId}_assembly_report.txt | cut -f9 | ave stdin | grep total | awk '{printf "%d", \$NF}'`
+  export longArg=""  # stays empty unless the column 9 total over ALL report rows is too large
+  if [ "\$asmSize" -gt 4294967295 ]; then  # 4294967295 = 2^32 - 1
+    longArg="-long"
+  fi
+  faToTwoBit \${longArg} \${asmId}_genomic.fna.gz \$asmId.2bit
   twoBitDup \$asmId.2bit > \$asmId.dups.txt
   if [ -s "\$asmId.dups.txt" ]; then
     printf "WARNING duplicate sequences found in \$asmId.2bit\\n" 1>&2
     cat \$asmId.dups.txt 1>&2
     awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list
     mv \${asmId}_genomic.fna.gz \${asmId}_genomic.fna.dups.gz
     faSomeRecords -exclude \${asmId}_genomic.fna.dups.gz \\
       \$asmId.remove.dups.list stdout | gzip -c > \${asmId}_genomic.fna.gz
     rm -f \$asmId.2bit
-    faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit
+    faToTwoBit \${longArg} \${asmId}_genomic.fna.gz \$asmId.2bit
   fi
   gzip -f \$asmId.dups.txt
   touch -r \${asmId}_genomic.fna.gz \$asmId.2bit
 else
   printf "# download step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 
 } # doDownload
 
 
 #########################################################################
@@ -908,43 +913,48 @@
 twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz
 hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
 twoBitInfo ../download/\$asmId.2bit stdout | cut -f1 \\
   | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" \\
     | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\$\\)/.\\1/;' \\
       | awk '{printf "%s\\t%s\\n", \$1, \$2}' | sort > \$asmId.fake.names
 _EOF_
       );
     }
 } else {
 printf STDERR "partsDone: %d\n", $partsDone;
   }
 
   $bossScript->add(<<_EOF_
 zcat *.agp.gz | gzip > ../\$dbName.agp.gz
-faToTwoBit *.fa.gz ../\$dbName.2bit
-faToTwoBit -noMask *.fa.gz ../\$dbName.unmasked.2bit
+export asmSize=`zgrep -v "^#" ../\$dbName.agp.gz | awk '{total += \$3 - \$2 + 1} END {printf "%d", total}'`
+export longArg=""  # stays empty unless the summed AGP row spans (col 3 - col 2 + 1) are too large
+if [ "\$asmSize" -gt 4294967295 ]; then  # 4294967295 = 2^32 - 1
+  longArg="-long"
+fi
+faToTwoBit \${longArg} *.fa.gz ../\$dbName.2bit
+faToTwoBit \${longArg} -noMask *.fa.gz ../\$dbName.unmasked.2bit
 twoBitDup ../\$dbName.unmasked.2bit > \$asmId.dups.txt
 if [ -s "\$asmId.dups.txt" ]; then
   printf "ERROR: duplicate sequences found in ../\$dbName.unmasked.2bit\\n" 1>&2
   cat \$asmId.dups.txt 1>&2
   awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list
   mv ../\$dbName.unmasked.2bit ../\$dbName.unmasked.dups.2bit
   twoBitToFa ../\$dbName.unmasked.dups.2bit stdout | faSomeRecords -exclude \\
     stdin \$asmId.remove.dups.list stdout | gzip -c > \$asmId.noDups.fasta.gz
   rm -f ../\$dbName.2bit ../\$dbName.unmasked.2bit
-  faToTwoBit \$asmId.noDups.fasta.gz ../\$dbName.2bit
-  faToTwoBit -noMask \$asmId.noDups.fasta.gz ../\$dbName.unmasked.2bit
+  faToTwoBit \${longArg} \$asmId.noDups.fasta.gz ../\$dbName.2bit
+  faToTwoBit \${longArg} -noMask \$asmId.noDups.fasta.gz ../\$dbName.unmasked.2bit
 fi
 gzip -f \$asmId.dups.txt
 touch -r ../download/\$asmId.2bit ../\$dbName.2bit
 touch -r ../download/\$asmId.2bit ../\$dbName.unmasked.2bit
 touch -r ../download/\$asmId.2bit ../\$dbName.agp.gz
 twoBitInfo ../\$dbName.2bit stdout | sort -k2nr > ../\$dbName.chrom.sizes
 touch -r ../\$dbName.2bit ../\$dbName.chrom.sizes
 # verify everything is there
 twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes
 export newTotal=`ave -col=2 ../\$dbName.chrom.sizes | grep "^total"`
 export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"`
 if [ "\$newTotal" != "\$oldTotal" ]; then
   printf "# ERROR: sequence construction error: not same totals source vs. new:\\n" 1>&2
   printf "# \$newTotal != \$oldTotal\\n" 1>&2
   exit 255
@@ -1559,37 +1569,40 @@
   twoBitToFa \$asmId.masked.2bit stdout | gzip -c > \$asmId.fa.gz
   touch -r \$asmId.masked.2bit \$asmId.fa.gz
   faSize \$asmId.fa.gz > \$asmId.masked.faSize.txt
   touch -r \$asmId.masked.2bit \$asmId.masked.faSize.txt
   bptForTwoBit \$asmId.masked.2bit \$asmId.masked.2bit.bpt
   touch -r \$asmId.masked.2bit \$asmId.masked.2bit.bpt
   cp -p \$asmId.fa.gz ../../\$asmId.fa.gz
   cp -p \$asmId.masked.faSize.txt ../../\$asmId.faSize.txt
   cp -p \$asmId.masked.2bit.bpt ../../\$asmId.2bit.bpt
   size=`grep -w bases \$asmId.masked.faSize.txt | cut -d' ' -f1`
   if [ \$size -lt 4294967297 ]; then
     ln \$asmId.masked.2bit \$accessionId.2bit
     gfServer -trans index ../../\$accessionId.trans.gfidx \$accessionId.2bit &
     gfServer -stepSize=5 index ../../\$accessionId.untrans.gfidx \$accessionId.2bit
     wait
+  else
+    ln \$asmId.masked.2bit \$accessionId.2bit
+    gfServerHuge -trans index ../../\$accessionId.trans.gfidx \$accessionId.2bit &
+    gfServerHuge -stepSize=5 index ../../\$accessionId.untrans.gfidx \$accessionId.2bit
+    wait
+  fi
   rm \$accessionId.2bit
   touch -r \$asmId.masked.2bit ../../\$accessionId.trans.gfidx
   touch -r \$asmId.masked.2bit ../../\$accessionId.untrans.gfidx
 else
-    printf "# genome \$asmId too large at \$size to make blat indexes\\n" 1>&2
-  fi
-else
   printf "# addMask step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
 
   $bossScript->execute();
 } # addMask
 
 #########################################################################
 # * step: windowMasker [workhorse]
 sub doWindowMasker {
   my $runDir = "$buildDir/trackData/windowMasker";
 
   &HgAutomate::mustMkdir($runDir);