src/hg/utils/automation/doXenoRefGene.pl cb83116ca0be25e3e45c93468d951fe53cb2c1e4

cb83116ca0be25e3e45c93468d951fe53cb2c1e4
hiram
  Fri Dec 27 00:49:20 2019 -0800
better partitioning of the target genome refs #24354

diff --git src/hg/utils/automation/doXenoRefGene.pl src/hg/utils/automation/doXenoRefGene.pl
index be16b2f..691a5c1 100755
--- src/hg/utils/automation/doXenoRefGene.pl
+++ src/hg/utils/automation/doXenoRefGene.pl
@@ -123,35 +123,58 @@
 # * step: splitTarget [workhorse]
 sub doSplitTarget {
   # run faSplit on the masked 2bit target sequence and prepare the target.list
   my $runDir = "$buildDir";
 
   # First, make sure we're starting clean.
   if (-d "$runDir/target") {
     die "doXenoRefGene splitTarget step already done, remove directory 'target' to rerun,\n" .
       "or '-continue blatRun' to run next step.\n";
   }
 
   my $whatItDoes="split the masked 2bit file into fasta files for blat alignment processing.";
   my $bossScript = newBash HgRemoteScript("$runDir/doSplitTarget.bash", $workhorse,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
+export asmId="$db"
+export maskedSeq="$maskedSeq"
+export queryCount=`cat "$mrnas/query.list" | wc -l`
+# aim for 1,000,000 cluster job batch size
+export targetPartCount=`echo \$queryCount | awk '{printf "%d", 1 + (1000000 / \$1)}'`
+twoBitInfo \$maskedSeq stdout | sort -k2,2nr > \$asmId.chrom.sizes
+export targetParts=`cat \$asmId.chrom.sizes | wc -l`
+export maxChunk=`head -1 \$asmId.chrom.sizes | awk '{printf "%d", 1.1*\$(NF)}'`
+export seqLimit=`echo \$targetParts \$targetPartCount | awk '{printf "%d", 1 + (\$1 / \$2)}'`
+export totalJobs=`echo \$queryCount \$targetPartCount | awk '{printf "%d", \$1 * \$2}'`
+rm -fr targetList
+~/kent/src/hg/utils/automation/partitionSequence.pl -concise \\
+   -lstDir=targetList \$maxChunk 0 \$maskedSeq \$asmId.chrom.sizes \$seqLimit
+rm -fr target
 mkdir target
-twoBitToFa $maskedSeq stdout | faSplit byname stdin target/
+ls targetList/*.lst | while read partSpec
+do
+  export part=`basename \$partSpec | sed -e 's/.lst/.fa/;'`
+  export faFile="target/\$part"
+  rm -f \$faFile
+  touch \$faFile
+  cat \$partSpec | while read seq
+  do
+    twoBitToFa \$seq stdout
+  done > \$faFile
+done
 gzip target/*.fa
 ls target | sed -e 's/.fa.gz//;' > target.list
-faSize -detailed target/*.fa.gz | sort -k2,2nr > $db.chrom.sizes
 _EOF_
   );
 
   $bossScript->execute();
 } # doSplitTarget
 
 #########################################################################
 # * step: blatRun [bigClusterHub]
 sub doBlatRun {
   # Set up and perform the cluster run to run the blat alignment of RefSeq
   #     mrnas to the split target fasta sequences.
   my $paraHub = $bigClusterHub;
   my $runDir = "$buildDir/blatRun";
   # First, make sure previous step has completed,
   #       and starting clean without this step result present: