0c48fdb70c330626acba56d84e4c4a48b8964693 hiram Thu Dec 2 11:40:57 2021 -0800 large performance improvement, better splitting procedure and smaller cluster batch sizes for high contig count assemblies no redmine diff --git src/hg/utils/automation/doXenoRefGene.pl src/hg/utils/automation/doXenoRefGene.pl index 8aa9818..ca80d0e 100755 --- src/hg/utils/automation/doXenoRefGene.pl +++ src/hg/utils/automation/doXenoRefGene.pl @@ -123,53 +123,58 @@ my $runDir = "$buildDir"; # First, make sure we're starting clean. if (-d "$runDir/target") { die "doXenoRefGene splitTarget step already done, remove directory 'target' to rerun,\n" . "or '-continue blatRun' to run next step.\n"; } my $whatItDoes="split the masked 2bit file into fasta files for blat alignment processing."; my $bossScript = newBash HgRemoteScript("$runDir/doSplitTarget.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId="$db" export maskedSeq="$maskedSeq" export queryCount=`cat "$mrnas/query.list" | wc -l` -# aim for 100,000 cluster job batch size +# aim for about 100,000 cluster job batch size, could end up less than this export targetPartCount=`echo \$queryCount | awk '{printf "%d", 1 + (100000 / \$1)}'` twoBitInfo \$maskedSeq stdout | sort -k2,2nr > \$asmId.chrom.sizes export targetParts=`cat \$asmId.chrom.sizes | wc -l` export maxChunk=`head -1 \$asmId.chrom.sizes | awk '{printf "%d", 1.1*\$(NF)}'` +if [ \$maxChunk -lt 10000000 ]; then + maxChunk=10000000 +fi export seqLimit=`echo \$targetParts \$targetPartCount | awk '{printf "%d", 1 + (\$1 / \$2)}'` +if [ \$seqLimit -lt 1000 ]; then + seqLimit=1000 +fi export totalJobs=`echo \$queryCount \$targetPartCount | awk '{printf "%d", \$1 * \$2}'` -printf "# batch job count will be: \%d\\n", \$totalJobs +printf "# batch job count will be approximately: \%d or even less than that.\\n", \$totalJobs rm -fr targetList ~/kent/src/hg/utils/automation/partitionSequence.pl -concise \\ -lstDir=targetList \$maxChunk 0 \$maskedSeq \$asmId.chrom.sizes \$seqLimit rm -fr target mkdir target ls targetList/*.lst | while read partSpec do - export part=`basename \$partSpec | sed -e 's/.lst/.fa/;'` - export faFile="target/\$part" + export part=`basename \$partSpec | sed -e 's/.lst//;'` + export faFile="target/\$part.fa" + export seqList="target/\$part.lst" rm -f \$faFile - touch \$faFile - cat \$partSpec | while read seq - do - twoBitToFa \$seq stdout - done > \$faFile + cat \$partSpec | sed -e 's#.*2bit:##;' > \$seqList + twoBitToFa -seqList=\$seqList \$maskedSeq \$faFile + rm -f \$seqList done gzip target/*.fa ls target | sed -e 's/.fa.gz//;' > target.list _EOF_ ); $bossScript->execute(); } # doSplitTarget ######################################################################### # * step: blatRun [bigClusterHub] sub doBlatRun { # Set up and perform the cluster run to run the blat alignment of RefSeq # mrnas to the split target fasta sequences. my $paraHub = $bigClusterHub;