# commit cb83116ca0be25e3e45c93468d951fe53cb2c1e4
# hiram Fri Dec 27 00:49:20 2019 -0800
# better partitioning of the target genome refs #24354
# diff --git src/hg/utils/automation/doXenoRefGene.pl src/hg/utils/automation/doXenoRefGene.pl
# index be16b2f..691a5c1 100755   hunk: @@ -123,35 +123,58 @@
#
# The code below is the post-commit state of that hunk.  The commit replaced
# "twoBitToFa $maskedSeq stdout | faSplit byname stdin target/" and the
# faSize-based "$db.chrom.sizes" with the partitioning described here.

# * step: splitTarget [workhorse]
sub doSplitTarget {
  # Partition the masked 2bit target sequence into grouped fasta files and
  # prepare the target.list for the blat cluster run.
  #
  # The generated bash script, run in $buildDir on $workhorse:
  #   - counts the queries listed in $mrnas/query.list
  #   - derives targetPartCount = 1 + int(1000000 / queryCount)
  #   - writes <db>.chrom.sizes from twoBitInfo, largest sequence first
  #   - calls partitionSequence.pl with a chunk size of 1.1 x the largest
  #     sequence and a limit of 1 + int(sequences / targetPartCount)
  #     sequences, writing list files into targetList/
  #   - builds one target/<part>.fa per targetList/<part>.lst by running
  #     twoBitToFa on every line of the list, then gzips the results
  #   - writes the part names (without .fa.gz) to target.list
  #
  # Dies if $buildDir/target already exists.  Takes no arguments; the value
  # of $bossScript->execute() is the implicit return value.
  my $runDir = "$buildDir";

  # First, make sure we're starting clean.
  if (-d "$runDir/target") {
    die "doXenoRefGene splitTarget step already done, remove directory 'target' to rerun,\n" .
      "or '-continue blatRun' to run next step.\n";
  }

  my $whatItDoes="split the masked 2bit file into fasta files for blat alignment processing.";
  my $bossScript = newBash HgRemoteScript("$runDir/doSplitTarget.bash", $workhorse,
                                      $runDir, $whatItDoes);

  # Inside this interpolating heredoc: \$ is a shell variable, an unescaped
  # $name is a Perl variable expanded when the script is written, and \\
  # becomes a single backslash in the generated script.
  $bossScript->add(<<_EOF_
export asmId="$db"
export maskedSeq="$maskedSeq"
export queryCount=`cat "$mrnas/query.list" | wc -l`
# 'export var=`cmd`' hides the exit status of cmd, so an empty query list
# would only show up as awk division errors and empty values further down
if [ \$queryCount -lt 1 ]; then
  echo "ERROR: no query sequences listed in $mrnas/query.list" 1>&2
  exit 255
fi
# aim for 1,000,000 cluster job batch size
export targetPartCount=`echo \$queryCount | awk '{printf "%d", 1 + (1000000 / \$1)}'`
twoBitInfo \$maskedSeq stdout | sort -k2,2nr > \$asmId.chrom.sizes
export targetParts=`cat \$asmId.chrom.sizes | wc -l`
export maxChunk=`head -1 \$asmId.chrom.sizes | awk '{printf "%d", 1.1*\$(NF)}'`
export seqLimit=`echo \$targetParts \$targetPartCount | awk '{printf "%d", 1 + (\$1 / \$2)}'`
# totalJobs is not referenced again in this script
export totalJobs=`echo \$queryCount \$targetPartCount | awk '{printf "%d", \$1 * \$2}'`
rm -fr targetList
# NOTE(review): the stdout of partitionSequence.pl is not captured; confirm
# that every partition is represented by a .lst file in targetList/
~/kent/src/hg/utils/automation/partitionSequence.pl -concise \\
   -lstDir=targetList \$maxChunk 0 \$maskedSeq \$asmId.chrom.sizes \$seqLimit
rm -fr target
mkdir target
ls targetList/*.lst | while read -r partSpec
do
  export part=`basename \$partSpec .lst`.fa
  export faFile="target/\$part"
  # the redirect creates or truncates the file, even for an empty list
  cat \$partSpec | while read -r seq
  do
    twoBitToFa \$seq stdout
  done > \$faFile
done
gzip target/*.fa
ls target | sed -e 's/\\.fa\\.gz\$//;' > target.list
_EOF_
  );
  $bossScript->execute();
} # doSplitTarget

#########################################################################
# * step: blatRun [bigClusterHub]
sub doBlatRun {
  # Set up and perform the cluster run to run the blat alignment of RefSeq
  # mrnas to the split target fasta sequences.
  my $paraHub = $bigClusterHub;
  my $runDir = "$buildDir/blatRun";

  # First, make sure previous step has completed,
  # and starting clean without this step result present: