4da1a14630708d887c3481e3b16df5044c86c59a hiram Tue Oct 13 11:16:59 2020 -0700 better batch size for xenoRefGene and fixup to handle assemblies without repeat masker and RM can not run on those species refs #26347 diff --git src/hg/utils/automation/doXenoRefGene.pl src/hg/utils/automation/doXenoRefGene.pl index 45f384d..fc2ca9a 100755 --- src/hg/utils/automation/doXenoRefGene.pl +++ src/hg/utils/automation/doXenoRefGene.pl @@ -26,31 +26,31 @@ my $stepper = new HgStepManager( [ { name => 'splitTarget', func => \&doSplitTarget }, { name => 'blatRun', func => \&doBlatRun }, { name => 'filterPsl', func => \&doFilterPsl }, { name => 'makeGp', func => \&doMakeGp }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $bigClusterHub = 'ku'; my $workhorse = 'hgwdev'; my $dbHost = 'hgwdev'; my $defaultWorkhorse = 'hgwdev'; my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit"; -my $mrnas = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq"; +my $mrnas = "/hive/data/genomes/asmHubs/xenoRefSeq"; my $noDbGenePredCheck = 1; # default yes, use -db for genePredCheck my $augustusDir = "/hive/data/outside/augustus/augustus-3.3.1"; my $augustusConfig="$augustusDir/config"; my $base = $0; $base =~ s/^(.*\/)?//; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base db options: "; @@ -125,37 +125,38 @@ my $runDir = "$buildDir"; # First, make sure we're starting clean. if (-d "$runDir/target") { die "doXenoRefGene splitTarget step already done, remove directory 'target' to rerun,\n" . "or '-continue blatRun' to run next step.\n"; } my $whatItDoes="split the masked 2bit file into fasta files for blat alignment processing."; my $bossScript = newBash HgRemoteScript("$runDir/doSplitTarget.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId="$db" export maskedSeq="$maskedSeq" export queryCount=`cat "$mrnas/query.list" | wc -l` -# aim for 1,000,000 cluster job batch size -export targetPartCount=`echo \$queryCount | awk '{printf "%d", 1 + (1000000 / \$1)}'` +# aim for 100,000 cluster job batch size +export targetPartCount=`echo \$queryCount | awk '{printf "%d", 1 + (100000 / \$1)}'` twoBitInfo \$maskedSeq stdout | sort -k2,2nr > \$asmId.chrom.sizes export targetParts=`cat \$asmId.chrom.sizes | wc -l` export maxChunk=`head -1 \$asmId.chrom.sizes | awk '{printf "%d", 1.1*\$(NF)}'` export seqLimit=`echo \$targetParts \$targetPartCount | awk '{printf "%d", 1 + (\$1 / \$2)}'` export totalJobs=`echo \$queryCount \$targetPartCount | awk '{printf "%d", \$1 * \$2}'` +printf "# batch job count will be: \%d\\n", \$totalJobs rm -fr targetList ~/kent/src/hg/utils/automation/partitionSequence.pl -concise \\ -lstDir=targetList \$maxChunk 0 \$maskedSeq \$asmId.chrom.sizes \$seqLimit rm -fr target mkdir target ls targetList/*.lst | while read partSpec do export part=`basename \$partSpec | sed -e 's/.lst/.fa/;'` export faFile="target/\$part" rm -f \$faFile touch \$faFile cat \$partSpec | while read seq do twoBitToFa \$seq stdout done > \$faFile @@ -247,31 +248,31 @@ my $whatItDoes = "Filters the raw psl results from the blatRun."; my $bossScript = newBash HgRemoteScript("$runDir/filterPsl.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export db="$db" find ./blatRun/result -type f | xargs cat \\ | gnusort -S100G --parallel=32 -k10,10 > \$db.all.psl pslCDnaFilter -minId=0.35 -minCover=0.25 -globalNearBest=0.0100 -minQSize=20 \\ -ignoreIntrons -repsAsMatch -ignoreNs -bestOverlap \\ \$db.all.psl \$db.xenoRefGene.psl pslCheck -targetSizes=\$db.chrom.sizes \\ - -querySizes=/hive/data/genomes/asmHubs/VGP/xenoRefSeq/xenoRefMrna.sizes \\ + -querySizes=$mrnas/xenoRefMrna.sizes \\ \$db.xenoRefGene.psl _EOF_ ); $bossScript->execute(); } # doFilterPsl ######################################################################### # * step: make gp [workhorse] sub doMakeGp { my $runDir = $buildDir; &HgAutomate::mustMkdir($runDir); # First, make sure we're starting clean. if (! -e "$runDir/$db.xenoRefGene.psl") { die "doMakeGp: the previous step filterPsl did not complete \n" .