3450181d0d4ea0e081016d29b67dcbe3245c5d32
hiram
  Mon Dec 6 14:40:45 2021 -0800
attempt to better adjust cluster batch size when many small contigs no redmine

diff --git src/hg/utils/automation/doSimpleRepeat.pl src/hg/utils/automation/doSimpleRepeat.pl
index c48e2a9..354a557 100755
--- src/hg/utils/automation/doSimpleRepeat.pl
+++ src/hg/utils/automation/doSimpleRepeat.pl
@@ -4,31 +4,31 @@
 # edit ~/kent/src/hg/utils/automation/doSimpleRepeat.pl instead.
 
 # $Id: doSimpleRepeat.pl,v 1.5 2009/06/08 18:38:58 hiram Exp $
 
 use Getopt::Long;
 use warnings;
 use strict;
 use Carp;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Hardcoded (for now):
-my $chunkSize = 50000000;
+my $chunkSize = 50000000;	# will be readjusted if seqCount > 100000
 my $singleRunSize = 200000000;
 my $clusterBin = qw(/cluster/bin/$MACHTYPE);
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_unmaskedSeq
     $opt_trf409
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'trf',     func => \&doTrf },
@@ -325,31 +325,31 @@
       "It concatenates .bed files from cluster run into simpleRepeat.bed.\n"
 	. $whatItDoes;
   }
   if ($chromBased) {
     $whatItDoes .=
 "It splits trfMask.bed into per-chrom files for bigZips download generation.";
   }
   my $fileServer = &HgAutomate::chooseFileServer($runDir);
   my $bossScript = new HgRemoteScript("$runDir/doFilter.csh", $fileServer,
 				      $runDir, $whatItDoes);
 
   # Use symbolic link created in cluster step:
   my $partDir = "$buildDir/TrfPart";
   if ($useCluster) {
     $bossScript->add(<<_EOF_
-cat $partDir/???/*.bed > simpleRepeat.bed
+find $partDir/??? -type f | grep lst.bed | xargs cat > simpleRepeat.bed
 endsInLf simpleRepeat.bed
 if (\$status) then
   echo Uh-oh -- simpleRepeat.bed fails endsInLf.  Look at $partDir/ bed files.
   exit 1
 endif
 _EOF_
     );
   }
   $bossScript->add(<<_EOF_
 if ( -s simpleRepeat.bed ) then
   awk '{if (\$5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
   awk 'BEGIN{OFS="\\t"}{name=substr(\$16,0,16);\$4=name;printf "%s\\n", \$0}' \\
     simpleRepeat.bed | sort -k1,1 -k2,2n > simpleRepeat.bed16.bed
   twoBitInfo $unmaskedSeq stdout | sort -k2nr > tmp.chrom.sizes
   bedToBigBed -tab -type=bed4+12 -as=\$HOME/kent/src/hg/lib/simpleRepeat.as \\
@@ -449,48 +449,54 @@
 $trf409 = $opt_trf409 ? $opt_trf409 : "";
 
 if (! -e $unmaskedSeq) {
   die $opt_unmaskedSeq ? "Error: -unmaskedSeq $unmaskedSeq does not exist.\n" :
     "Error: required file $unmaskedSeq does not exist. " .
       "(use -unmaskedSeq <file> ?)\n";
 }
 die "Error: -unmaskedSeq filename must end in .2bit (got $unmaskedSeq).\n"
   if ($unmaskedSeq !~ /\.2bit$/);
 if ($unmaskedSeq !~ /^\//) {
   my $pwd = `pwd`;
   chomp $pwd;
   $unmaskedSeq = "$pwd/$unmaskedSeq";
 }
 
+# try to adjust chunkSize to achieve a reasonable (1K) single job sequence count
 my $pipe = &HgAutomate::mustOpen("twoBitInfo $unmaskedSeq stdout |");
 my $seqCount = 0;
 my $genomeSize = 0;
 while (<$pipe>) {
   chomp;
   my (undef, $size) = split;
   $genomeSize += $size;
   $seqCount++;
-  if ($seqCount > $HgAutomate::splitThreshold &&
-      $genomeSize > $singleRunSize) {
-    # No need to keep counting -- we know our boolean answers.
-    last;
-  }
 }
 close($pipe);
 die "Could not open pipe from twoBitInfo $unmaskedSeq"
   unless ($genomeSize > 0 && $seqCount > 0);
 
+# lots of contigs, and big genome, adjust chunkSize
+if ( ($seqCount > 100000) && ($genomeSize > $singleRunSize) ) {
+  # if they were uniform size, 1K jobs would be sized:
+  my $uniform1K = int( 1 + $genomeSize / 1000 );
+  # but if they were all small, 1K jobs would be sized
+  my $allSmall1K = int (1 + (($seqCount / 1000) * 5000));
+  # use the smaller size of those two measures
+  $chunkSize = $uniform1K > $allSmall1K ? $allSmall1K : $uniform1K;
+}
+
 $chromBased = ($seqCount <= $HgAutomate::splitThreshold);
 $useCluster = ($genomeSize > $singleRunSize);
 &HgAutomate::verbose(2, "\n$db is " .
 		     ($chromBased ? "chrom-based" : "scaffold-based") . ".\n" .
 		     "Total genome size is " .
 		     ($useCluster ? "> $singleRunSize; cluster & cat." :
 		                    "<= $singleRunSize; single run.") .
 		     "\n\n");
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :