3450181d0d4ea0e081016d29b67dcbe3245c5d32 hiram Mon Dec 6 14:40:45 2021 -0800 attempt to better adjust cluster batch size when many small contigs no redmine diff --git src/hg/utils/automation/doSimpleRepeat.pl src/hg/utils/automation/doSimpleRepeat.pl index c48e2a9..354a557 100755 --- src/hg/utils/automation/doSimpleRepeat.pl +++ src/hg/utils/automation/doSimpleRepeat.pl @@ -4,31 +4,31 @@ # edit ~/kent/src/hg/utils/automation/doSimpleRepeat.pl instead. # $Id: doSimpleRepeat.pl,v 1.5 2009/06/08 18:38:58 hiram Exp $ use Getopt::Long; use warnings; use strict; use Carp; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; # Hardcoded (for now): -my $chunkSize = 50000000; +my $chunkSize = 50000000; # will be readjusted if seqCount > 100000 my $singleRunSize = 200000000; my $clusterBin = qw(/cluster/bin/$MACHTYPE); # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_unmaskedSeq $opt_trf409 /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'trf', func => \&doTrf }, @@ -325,31 +325,31 @@ "It concatenates .bed files from cluster run into simpleRepeat.bed.\n" . $whatItDoes; } if ($chromBased) { $whatItDoes .= "It splits trfMask.bed into per-chrom files for bigZips download generation."; } my $fileServer = &HgAutomate::chooseFileServer($runDir); my $bossScript = new HgRemoteScript("$runDir/doFilter.csh", $fileServer, $runDir, $whatItDoes); # Use symbolic link created in cluster step: my $partDir = "$buildDir/TrfPart"; if ($useCluster) { $bossScript->add(<<_EOF_ -cat $partDir/???/*.bed > simpleRepeat.bed +find $partDir/??? -type f | grep lst.bed | xargs cat > simpleRepeat.bed endsInLf simpleRepeat.bed if (\$status) then echo Uh-oh -- simpleRepeat.bed fails endsInLf. Look at $partDir/ bed files. 
exit 1 endif _EOF_ ); } $bossScript->add(<<_EOF_ if ( -s simpleRepeat.bed ) then awk '{if (\$5 <= 12) print;}' simpleRepeat.bed > trfMask.bed awk 'BEGIN{OFS="\\t"}{name=substr(\$16,0,16);\$4=name;printf "%s\\n", \$0}' \\ simpleRepeat.bed | sort -k1,1 -k2,2n > simpleRepeat.bed16.bed twoBitInfo $unmaskedSeq stdout | sort -k2nr > tmp.chrom.sizes bedToBigBed -tab -type=bed4+12 -as=\$HOME/kent/src/hg/lib/simpleRepeat.as \\ @@ -449,48 +449,54 @@ $trf409 = $opt_trf409 ? $opt_trf409 : ""; if (! -e $unmaskedSeq) { die $opt_unmaskedSeq ? "Error: -unmaskedSeq $unmaskedSeq does not exist.\n" : "Error: required file $unmaskedSeq does not exist. " . "(use -unmaskedSeq <file> ?)\n"; } die "Error: -unmaskedSeq filename must end in .2bit (got $unmaskedSeq).\n" if ($unmaskedSeq !~ /\.2bit$/); if ($unmaskedSeq !~ /^\//) { my $pwd = `pwd`; chomp $pwd; $unmaskedSeq = "$pwd/$unmaskedSeq"; } +# try to adjust chunkSize to achieve a reasonable (1K) single job sequence count my $pipe = &HgAutomate::mustOpen("twoBitInfo $unmaskedSeq stdout |"); my $seqCount = 0; my $genomeSize = 0; while (<$pipe>) { chomp; my (undef, $size) = split; $genomeSize += $size; $seqCount++; - if ($seqCount > $HgAutomate::splitThreshold && - $genomeSize > $singleRunSize) { - # No need to keep counting -- we know our boolean answers. - last; - } } close($pipe); die "Could not open pipe from twoBitInfo $unmaskedSeq" unless ($genomeSize > 0 && $seqCount > 0); +# lots of contigs, and big genome, adjust chunkSize +if ( ($seqCount > 100000) && ($genomeSize > $singleRunSize) ) { + # if they were uniform size, 1K jobs would be sized: + my $uniform1K = int( 1 + $genomeSize / 1000 ); + # but if they were all small, 1K jobs would be sized + my $allSmall1K = int (1 + (($seqCount / 1000) * 5000)); + # use the smaller size of those two measures + $chunkSize = $uniform1K > $allSmall1K ? 
$allSmall1K : $uniform1K; +} + $chromBased = ($seqCount <= $HgAutomate::splitThreshold); $useCluster = ($genomeSize > $singleRunSize); &HgAutomate::verbose(2, "\n$db is " . ($chromBased ? "chrom-based" : "scaffold-based") . ".\n" . "Total genome size is " . ($useCluster ? "> $singleRunSize; cluster & cat." : "<= $singleRunSize; single run.") . "\n\n"); # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" :