src/hg/utils/automation/doXenoRefGene.pl 0c48fdb70c330626acba56d84e4c4a48b8964693

0c48fdb70c330626acba56d84e4c4a48b8964693
hiram
  Thu Dec 2 11:40:57 2021 -0800
Large performance improvement: a better splitting procedure and smaller cluster batch sizes for high-contig-count assemblies. No redmine.

diff --git src/hg/utils/automation/doXenoRefGene.pl src/hg/utils/automation/doXenoRefGene.pl
index 8aa9818..ca80d0e 100755
--- src/hg/utils/automation/doXenoRefGene.pl
+++ src/hg/utils/automation/doXenoRefGene.pl
@@ -123,53 +123,58 @@
   my $runDir = "$buildDir";
 
   # First, make sure we're starting clean.
   if (-d "$runDir/target") {
     die "doXenoRefGene splitTarget step already done, remove directory 'target' to rerun,\n" .
       "or '-continue blatRun' to run next step.\n";
   }
 
   my $whatItDoes="split the masked 2bit file into fasta files for blat alignment processing.";
   my $bossScript = newBash HgRemoteScript("$runDir/doSplitTarget.bash", $workhorse,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 export asmId="$db"
 export maskedSeq="$maskedSeq"
 export queryCount=`cat "$mrnas/query.list" | wc -l`
-# aim for 100,000 cluster job batch size
+# aim for about 100,000 cluster job batch size, could end up less than this
 export targetPartCount=`echo \$queryCount | awk '{printf "%d", 1 + (100000 / \$1)}'`
 twoBitInfo \$maskedSeq stdout | sort -k2,2nr > \$asmId.chrom.sizes
 export targetParts=`cat \$asmId.chrom.sizes | wc -l`
 export maxChunk=`head -1 \$asmId.chrom.sizes | awk '{printf "%d", 1.1*\$(NF)}'`
+if [ \$maxChunk -lt 10000000 ]; then
+  maxChunk=10000000
+fi
 export seqLimit=`echo \$targetParts \$targetPartCount | awk '{printf "%d", 1 + (\$1 / \$2)}'`
+if [ \$seqLimit -lt 1000 ]; then
+  seqLimit=1000
+fi
 export totalJobs=`echo \$queryCount \$targetPartCount | awk '{printf "%d", \$1 * \$2}'`
-printf "# batch job count will be: \%d\\n", \$totalJobs
+printf "# batch job count will be approximately: \%d or even less than that.\\n", \$totalJobs
 rm -fr targetList
 ~/kent/src/hg/utils/automation/partitionSequence.pl -concise \\
    -lstDir=targetList \$maxChunk 0 \$maskedSeq \$asmId.chrom.sizes \$seqLimit
 rm -fr target
 mkdir target
 ls targetList/*.lst | while read partSpec
 do
-  export part=`basename \$partSpec | sed -e 's/.lst/.fa/;'`
-  export faFile="target/\$part"
+  export part=`basename \$partSpec | sed -e 's/.lst//;'`
+  export faFile="target/\$part.fa"
+  export seqList="target/\$part.lst"
   rm -f \$faFile
-  touch \$faFile
-  cat \$partSpec | while read seq
-  do
-    twoBitToFa \$seq stdout
-  done > \$faFile
+  cat \$partSpec | sed -e 's#.*2bit:##;' > \$seqList
+  twoBitToFa -seqList=\$seqList \$maskedSeq \$faFile
+  rm -f \$seqList
 done
 gzip target/*.fa
 ls target | sed -e 's/.fa.gz//;' > target.list
 _EOF_
   );
 
   $bossScript->execute();
 } # doSplitTarget
 
 #########################################################################
 # * step: blatRun [bigClusterHub]
 sub doBlatRun {
   # Set up and perform the cluster run to run the blat alignment of RefSeq
   #     mrnas to the split target fasta sequences.
   my $paraHub = $bigClusterHub;