4da1a14630708d887c3481e3b16df5044c86c59a
hiram
  Tue Oct 13 11:16:59 2020 -0700
better batch size for xenoRefGene, and fixup to handle assemblies without repeat masker (RM can not run on those species), refs #26347

diff --git src/hg/utils/automation/doXenoRefGene.pl src/hg/utils/automation/doXenoRefGene.pl
index 45f384d..fc2ca9a 100755
--- src/hg/utils/automation/doXenoRefGene.pl
+++ src/hg/utils/automation/doXenoRefGene.pl
@@ -26,31 +26,31 @@
 my $stepper = new HgStepManager(
     [ { name => 'splitTarget',   func => \&doSplitTarget },
       { name => 'blatRun', func => \&doBlatRun },
       { name => 'filterPsl', func => \&doFilterPsl },
       { name => 'makeGp', func => \&doMakeGp },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $bigClusterHub = 'ku';
 my $workhorse = 'hgwdev';
 my $dbHost = 'hgwdev';
 my $defaultWorkhorse = 'hgwdev';
 my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit";
-my $mrnas = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq";
+my $mrnas = "/hive/data/genomes/asmHubs/xenoRefSeq";
 my $noDbGenePredCheck = 1;    # default yes, use -db for genePredCheck
 my $augustusDir = "/hive/data/outside/augustus/augustus-3.3.1";
 my $augustusConfig="$augustusDir/config";
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base db
 options:
 ";
@@ -125,37 +125,38 @@
   my $runDir = "$buildDir";
 
   # First, make sure we're starting clean.
   if (-d "$runDir/target") {
     die "doXenoRefGene splitTarget step already done, remove directory 'target' to rerun,\n" .
       "or '-continue blatRun' to run next step.\n";
   }
 
   my $whatItDoes="split the masked 2bit file into fasta files for blat alignment processing.";
   my $bossScript = newBash HgRemoteScript("$runDir/doSplitTarget.bash", $workhorse,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 export asmId="$db"
 export maskedSeq="$maskedSeq"
 export queryCount=`cat "$mrnas/query.list" | wc -l`
-# aim for 1,000,000 cluster job batch size
-export targetPartCount=`echo \$queryCount | awk '{printf "%d", 1 + (1000000 / \$1)}'`
+# aim for 100,000 cluster job batch size
+export targetPartCount=`echo \$queryCount | awk '{printf "%d", 1 + (100000 / \$1)}'`
 twoBitInfo \$maskedSeq stdout | sort -k2,2nr > \$asmId.chrom.sizes
 export targetParts=`cat \$asmId.chrom.sizes | wc -l`
 export maxChunk=`head -1 \$asmId.chrom.sizes | awk '{printf "%d", 1.1*\$(NF)}'`
 export seqLimit=`echo \$targetParts \$targetPartCount | awk '{printf "%d", 1 + (\$1 / \$2)}'`
 export totalJobs=`echo \$queryCount \$targetPartCount | awk '{printf "%d", \$1 * \$2}'`
+printf "# batch job count will be: \%d\\n" \$totalJobs
 rm -fr targetList
 ~/kent/src/hg/utils/automation/partitionSequence.pl -concise \\
    -lstDir=targetList \$maxChunk 0 \$maskedSeq \$asmId.chrom.sizes \$seqLimit
 rm -fr target
 mkdir target
 ls targetList/*.lst | while read partSpec
 do
   export part=`basename \$partSpec | sed -e 's/.lst/.fa/;'`
   export faFile="target/\$part"
   rm -f \$faFile
   touch \$faFile
   cat \$partSpec | while read seq
   do
     twoBitToFa \$seq stdout
   done > \$faFile
@@ -247,31 +248,31 @@
 
   my $whatItDoes = "Filters the raw psl results from the blatRun.";
   my $bossScript = newBash HgRemoteScript("$runDir/filterPsl.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export db="$db"
 find ./blatRun/result -type f | xargs cat \\
     | gnusort -S100G --parallel=32 -k10,10 > \$db.all.psl
 
 pslCDnaFilter -minId=0.35 -minCover=0.25  -globalNearBest=0.0100 -minQSize=20 \\
   -ignoreIntrons -repsAsMatch -ignoreNs -bestOverlap \\
     \$db.all.psl \$db.xenoRefGene.psl
 
 pslCheck -targetSizes=\$db.chrom.sizes \\
-  -querySizes=/hive/data/genomes/asmHubs/VGP/xenoRefSeq/xenoRefMrna.sizes \\
+  -querySizes=$mrnas/xenoRefMrna.sizes \\
      \$db.xenoRefGene.psl
 _EOF_
   );
   $bossScript->execute();
 } # doFilterPsl
 
 #########################################################################
 # * step: make gp [workhorse]
 sub doMakeGp {
   my $runDir = $buildDir;
   &HgAutomate::mustMkdir($runDir);
 
   # First, make sure we're starting clean.
   if (! -e "$runDir/$db.xenoRefGene.psl") {
     die "doMakeGp: the previous step filterPsl did not complete \n" .