3450181d0d4ea0e081016d29b67dcbe3245c5d32 hiram Mon Dec 6 14:40:45 2021 -0800
attempt to better adjust cluster batch size when many small contigs
no redmine

diff --git src/hg/utils/automation/doSimpleRepeat.pl src/hg/utils/automation/doSimpleRepeat.pl
index c48e2a9..354a557 100755
--- src/hg/utils/automation/doSimpleRepeat.pl
+++ src/hg/utils/automation/doSimpleRepeat.pl
@@ -1,522 +1,528 @@
#!/usr/bin/env perl

# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/doSimpleRepeat.pl instead.

# $Id: doSimpleRepeat.pl,v 1.5 2009/06/08 18:38:58 hiram Exp $

use Getopt::Long;
use warnings;
use strict;
use Carp;
use FindBin qw($Bin);
use lib "$Bin";
use HgAutomate;
use HgRemoteScript;
use HgStepManager;

# Hardcoded (for now):
-my $chunkSize = 50000000;
+my $chunkSize = 50000000;  # will be readjusted if seqCount > 100000
my $singleRunSize = 200000000;
my $clusterBin = qw(/cluster/bin/$MACHTYPE);

# Option variable names, both common and peculiar to this script:
use vars @HgAutomate::commonOptionVars;
use vars @HgStepManager::optionVars;
use vars qw/
    $opt_buildDir
    $opt_unmaskedSeq
    $opt_trf409
    /;

# Specify the steps supported with -continue / -stop:
my $stepper = new HgStepManager(
    [ { name => 'trf',     func => \&doTrf },
      { name => 'filter',  func => \&doFilter },
      { name => 'load',    func => \&doLoad },
      { name => 'cleanup', func => \&doCleanup },
    ]
    );

# Option defaults:
my $defaultSmallClusterHub = 'most available';
my $defaultWorkhorse = 'least loaded';
my $dbHost = 'hgwdev';
my $unmaskedSeq = "\$db.unmasked.2bit";
my $trf409 = "";

my $base = $0;
$base =~ s/^(.*\/)?//;

sub usage {
  # Usage / help / self-documentation:
  my ($status, $detailed) = @_;
  # Basic help (for incorrect usage):
  print STDERR "
usage: $base db
options:
";
  print STDERR $stepper->getOptionHelp();
  print STDERR <<_EOF_
    -buildDir dir         Use dir instead of default
                          $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/simpleRepeat.\$date
                          (necessary when continuing at a later date).
    -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead
                          of default ($unmaskedSeq).
    -trf409 n             use new -l option to trf v4.09 (l=n)
                          maximum TR length expected (in millions)
                          (eg, -l=3 for 3 million)
                          Human genome hg38 uses: -trf409=6 -> -l=6
_EOF_
  ;
  print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
						'workhorse' => '',
						'smallClusterHub' => '');
  my ($sizeM, $chunkM) = ($singleRunSize, $chunkSize);
  $sizeM =~ s/000000$/Mb/;
  $chunkM =~ s/000000$/Mb/;
  print STDERR "
Automates UCSC's simpleRepeat (TRF) process for genome database \$db.  Steps:
    trf:     If total genome size is <= $sizeM, run trfBig on a workhorse;
             otherwise do a cluster run of trfBig on $chunkM sequence chunks.
    filter:  If a cluster run was performed, concatenate the results into
             simpleRepeat.bed.  Filter simpleRepeat.bed (period <= 12) to
             trfMask.bed.  If \$db is chrom-based, split trfMask.bed into
             trfMaskChrom/chr*.bed for downloads.
    load:    Load simpleRepeat.bed into the simpleRepeat table in \$db.
    cleanup: Removes or compresses intermediate files.
All operations are performed in the build directory which is
$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/simpleRepeat.\$date
unless -buildDir is given.
Run -help to see what files are required for this script.
";
  # Detailed help (-help):
  print STDERR "
Assumptions:
1. $HgAutomate::clusterData/\$db/\$db.unmasked.2bit contains sequence for
   database/assembly \$db.  (This can be overridden with -unmaskedSeq.)
2. $clusterBin/trfBig and $clusterBin/trf exist on workhorse, fileServer and
   cluster nodes.
" if ($detailed);
  print "\n";
  exit $status;
}

# Globals:
# Command line args: db
my ($db);
# Other:
my ($buildDir, $chromBased, $useCluster);

sub checkOptions {
  # Make sure command line options are valid/supported.
  my $ok = GetOptions(@HgStepManager::optionSpec,
		      'buildDir=s',
		      'unmaskedSeq=s',
		      'trf409=s',
		      @HgAutomate::commonOptionSpec,
		      );
  &usage(1) if (!$ok);
  &usage(0, 1) if ($opt_help);
  &HgAutomate::processCommonOptions();
  my $err = $stepper->processOptions();
  usage(1) if ($err);
  $dbHost = $opt_dbHost if ($opt_dbHost);
}

#########################################################################
# * step: trf [smallClusterHub or workhorse depending on genome size]

sub doCluster {
  my $runDir = "$buildDir/run.cluster";
  &HgAutomate::mustMkdir($runDir);

  my $paraHub = $opt_smallClusterHub ? $opt_smallClusterHub :
    &HgAutomate::chooseSmallClusterByBandwidth();
  my @okIn = grep !/scratch/,
    &HgAutomate::chooseFilesystemsForCluster($paraHub, "in");
  my @okOut = &HgAutomate::chooseFilesystemsForCluster($paraHub, "out");
  if (scalar(@okOut) > 1) {
    @okOut = grep !/$okIn[0]/, @okOut;
  }
  my $inHive = 0;
  $inHive = 1 if ($okIn[0] =~ m#/hive/data/genomes#);
  my $clusterSeqDir = "$okIn[0]/$db";
  $clusterSeqDir = "$buildDir" if ($opt_unmaskedSeq);
  my $clusterSeq = "$clusterSeqDir/$db.doSimp.2bit";
  if ($inHive) {
    $clusterSeq = "$clusterSeqDir/$db.unmasked.2bit";
  }
  $clusterSeq = "$unmaskedSeq" if ($opt_unmaskedSeq);
  my $partDir .= "$okOut[0]/$db/TrfPart";
  $partDir = "$buildDir/TrfPart" if ($opt_unmaskedSeq);
  my $trf409Option = "";
  my $trfCmd = "trf";
  if ($trf409 ne 0) {
    $trf409Option = "-l=$trf409";
    $trfCmd = "trf.4.09";
  }

  # Cluster job script:
  my $fh = &HgAutomate::mustOpen(">$runDir/TrfRun.csh");
  print $fh <<_EOF_
#!/bin/csh -ef

set finalOut = \$1
set inLst = \$finalOut:r
set inLft = \$inLst:r.lft

$HgAutomate::setMachtype

# Use local disk for output, and move the final result to \$finalOut
# when done, to minimize I/O.
set tmpDir = `mktemp -d -p /scratch/tmp doSimpleRepeat.cluster.XXXXXX`
pushd \$tmpDir

foreach spec (`cat \$inLst`)
  # Remove path and .2bit filename to get just the seq:start-end spec:
  set base = `echo \$spec | sed -r -e 's/^[^:]+://'`
  # If \$spec is the whole sequence, twoBitToFa removes the :start-end part,
  # which causes liftUp to barf later.  So tweak the header back to
  # seq:start-end for liftUp's sake:
  twoBitToFa \$spec stdout \\
  | sed -e "s/^>.*/>\$base/" \\
  | $clusterBin/trfBig $trf409Option -trf=$clusterBin/$trfCmd \\
      stdin /dev/null -bedAt=\$base.bed -tempDir=/scratch/tmp
end

# Due to the large chunk size, .lft files can have thousands of lines.
# Even though the liftUp code does &lineFileClose, somehow we still
# run out of filehandles.  So limit the size of liftSpecs:
split -a 3 -d -l 500 \$inLft SplitLft.

# Lift up:
foreach splitLft (SplitLft.*)
  set bedFiles = `awk '{print \$2 ".bed"};' \$splitLft`
  endsInLf -zeroOk \$bedFiles
  set lineCount=`cat \$bedFiles | wc -l`
  if (\$lineCount > 0) then
    cat \$bedFiles \\
    | liftUp -type=.bed tmpOut.\$splitLft \$splitLft error stdin
  else
    touch tmpOut.\$splitLft
  endif
end
cat tmpOut.* > tmpOut__bed

# endsInLf is much faster than using para to {check out line}:
endsInLf -zeroOk tmpOut*

# Move final result into place:
mv tmpOut__bed \$finalOut

popd
rm -rf \$tmpDir
_EOF_
  ;
  close($fh);

  &HgAutomate::makeGsub($runDir, "./TrfRun.csh $partDir/\$(path1).bed");
  `touch "$runDir/para_hub_$paraHub"`;

  my $chunkM = $chunkSize;
  $chunkM =~ s/000000$/Mb/;
  my $whatItDoes =
"It computes a logical partition of unmasked 2bit into $chunkM chunks
and runs it on the cluster with the most available bandwidth.";
  my $bossScript = new HgRemoteScript("$runDir/doTrf.csh", $paraHub,
				      $runDir, $whatItDoes);

  if ( ! $opt_unmaskedSeq && ! $inHive) {
    $bossScript->add(<<_EOF_
mkdir -p $clusterSeqDir
rsync -av $unmaskedSeq $clusterSeq
_EOF_
    );
  }
  my $paraRun = &HgAutomate::paraRun();
  my $gensub2 = &HgAutomate::gensub2();
  if ($opt_unmaskedSeq) {
    $bossScript->add(<<_EOF_
chmod a+x TrfRun.csh
rm -rf $partDir
$Bin/simplePartition.pl $clusterSeq $chunkSize $partDir
$gensub2 $partDir/partitions.lst single gsub jobList
$paraRun
_EOF_
    );
  } else {
    my $paraRun = &HgAutomate::paraRun();
    my $gensub2 = &HgAutomate::gensub2();
    $bossScript->add(<<_EOF_
chmod a+x TrfRun.csh
rm -rf $partDir
$Bin/simplePartition.pl $clusterSeq $chunkSize $partDir
rm -f $buildDir/TrfPart
ln -s $partDir $buildDir/TrfPart
$gensub2 $partDir/partitions.lst single gsub jobList
$paraRun
_EOF_
    );
  }
  if (! $opt_unmaskedSeq && ! $inHive) {
    $bossScript->add(<<_EOF_
rm -f $clusterSeq
_EOF_
    );
  }
  $bossScript->execute();
} # doCluster

sub doSingle {
  my $runDir = "$buildDir";
  &HgAutomate::mustMkdir($runDir);
  my $workhorse = &HgAutomate::chooseWorkhorse();
  my $whatItDoes = "It runs trfBig on the entire (small) genome in one pass.";
  my $bossScript = new HgRemoteScript("$runDir/doTrf.csh", $workhorse,
				      $runDir, $whatItDoes);
  my $trf409Option = "";
  my $trfCmd = "trf";
  if ($trf409 ne 0) {
    $trf409Option = "-l=$trf409";
    $trfCmd = "trf.4.09";
  }
  $bossScript->add(<<_EOF_
$HgAutomate::setMachtype
twoBitToFa $unmaskedSeq stdout \\
| $clusterBin/trfBig $trf409Option -trf=$clusterBin/$trfCmd \\
    stdin /dev/null -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp
_EOF_
  );
  $bossScript->execute();
} # doSingle

sub doTrf {
  if ($useCluster) {
    &doCluster();
  } else {
    &doSingle();
  }
} # doTrf

#########################################################################
# * step: filter [fileServer]

sub doFilter {
  my $runDir = "$buildDir";
  if ($useCluster) {
    &HgAutomate::checkExistsUnlessDebug('trf', 'filter',
					"$buildDir/run.cluster/run.time");
  }
  my $whatItDoes = "It filters simpleRepeat.bed > trfMask.bed.\n";
  if ($useCluster) {
    $whatItDoes =
      "It concatenates .bed files from cluster run into simpleRepeat.bed.\n"
      . $whatItDoes;
  }
  if ($chromBased) {
    $whatItDoes .=
      "It splits trfMask.bed into per-chrom files for bigZips download generation.";
  }
  my $fileServer = &HgAutomate::chooseFileServer($runDir);
  my $bossScript = new HgRemoteScript("$runDir/doFilter.csh", $fileServer,
				      $runDir, $whatItDoes);

  # Use symbolic link created in cluster step:
  my $partDir = "$buildDir/TrfPart";
  if ($useCluster) {
    $bossScript->add(<<_EOF_
-cat $partDir/???/*.bed > simpleRepeat.bed
+find $partDir/??? -type f | grep lst.bed | xargs cat > simpleRepeat.bed
endsInLf simpleRepeat.bed
if (\$status) then
  echo Uh-oh -- simpleRepeat.bed fails endsInLf.  Look at $partDir/ bed files.
  exit 1
endif
_EOF_
    );
  }
  $bossScript->add(<<_EOF_
if ( -s simpleRepeat.bed ) then
  awk '{if (\$5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
  awk 'BEGIN{OFS="\\t"}{name=substr(\$16,0,16);\$4=name;printf "%s\\n", \$0}' \\
    simpleRepeat.bed | sort -k1,1 -k2,2n > simpleRepeat.bed16.bed
  twoBitInfo $unmaskedSeq stdout | sort -k2nr > tmp.chrom.sizes
  bedToBigBed -tab -type=bed4+12 -as=\$HOME/kent/src/hg/lib/simpleRepeat.as \\
    simpleRepeat.bed16.bed tmp.chrom.sizes simpleRepeat.bb
  rm -f tmp.chrom.sizes simpleRepeat.bed16.bed
else
  echo empty simpleRepeat.bed - no repeats found
endif
_EOF_
  );

  if ($chromBased) {
    $bossScript->add(<<_EOF_
if ( -s trfMask.bed ) then
  splitFileByColumn trfMask.bed trfMaskChrom/
else
  echo empty trfMask.bed - no repeats found
endif
_EOF_
    );
  }
  $bossScript->execute();
} # doFilter

#########################################################################
# * step: load [dbHost]

sub doLoad {
  my $runDir = "$buildDir";
  &HgAutomate::checkExistsUnlessDebug('filter', 'load',
				      "$buildDir/simpleRepeat.bed");

  my $whatItDoes = "It loads simpleRepeat.bed into the simpleRepeat table.";
  my $bossScript = new HgRemoteScript("$runDir/doLoad.csh", $dbHost,
				      $runDir, $whatItDoes);
  $bossScript->add(<<_EOF_
if ( -s "simpleRepeat.bed" ) then
  hgLoadBed $db simpleRepeat simpleRepeat.bed \\
    -sqlTable=\$HOME/kent/src/hg/lib/simpleRepeat.sql
  featureBits $db simpleRepeat >& fb.simpleRepeat
  cat fb.simpleRepeat
else
  echo empty simpleRepeat.bed - no repeats found
endif
_EOF_
  );
  $bossScript->execute();
} # doLoad

#########################################################################
# * step: cleanup [fileServer]

sub doCleanup {
  my $runDir = "$buildDir";
  my $whatItDoes = "It cleans up or compresses intermediate files.";
  my $fileServer = &HgAutomate::chooseFileServer($runDir);
  my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer,
				      $runDir, $whatItDoes);
  $bossScript->add(<<_EOF_
if ( -d "TrfPart" || -l "TrfPart" ) then
  rm -fr TrfPart/*
  rm -fr TrfPart
endif
if (-d /hive/data/genomes/$db/TrfPart) then
  rmdir /hive/data/genomes/$db/TrfPart
endif
_EOF_
  );
  $bossScript->execute();
} # doCleanup

#########################################################################
# main

# Prevent "Suspended (tty input)" hanging:
&HgAutomate::closeStdin();

# Make sure we have valid options and exactly 1 argument:
&checkOptions();
&usage(1) if (scalar(@ARGV) != 1);
($db) = @ARGV;

if (! $opt_trf409 ) {
  die "Error: must supply -trf409 argument, e.g.: -trf409=6\n";
}

# Now that we know the $db, figure out our paths:
my $date = `date +%Y-%m-%d`;
chomp $date;
$buildDir = $opt_buildDir ? $opt_buildDir :
  "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/simpleRepeat.$date";
$unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq :
  "$HgAutomate::clusterData/$db/$db.unmasked.2bit";
$trf409 = $opt_trf409 ? $opt_trf409 : "";
if (! -e $unmaskedSeq) {
  die $opt_unmaskedSeq ?
    "Error: -unmaskedSeq $unmaskedSeq does not exist.\n" :
    "Error: required file $unmaskedSeq does not exist. " .
"(use -unmaskedSeq ?)\n"; } die "Error: -unmaskedSeq filename must end in .2bit (got $unmaskedSeq).\n" if ($unmaskedSeq !~ /\.2bit$/); if ($unmaskedSeq !~ /^\//) { my $pwd = `pwd`; chomp $pwd; $unmaskedSeq = "$pwd/$unmaskedSeq"; } +# try to adjust chunkSize to achive a reasonable (1K) single job sequence count my $pipe = &HgAutomate::mustOpen("twoBitInfo $unmaskedSeq stdout |"); my $seqCount = 0; my $genomeSize = 0; while (<$pipe>) { chomp; my (undef, $size) = split; $genomeSize += $size; $seqCount++; - if ($seqCount > $HgAutomate::splitThreshold && - $genomeSize > $singleRunSize) { - # No need to keep counting -- we know our boolean answers. - last; - } } close($pipe); die "Could not open pipe from twoBitInfo $unmaskedSeq" unless ($genomeSize > 0 && $seqCount > 0); +# lots of contigs, and big genome, adjust chunkSize +if ( ($seqCount > 100000) && ($genomeSize > $singleRunSize) ) { + # if they were uniform size, 1K jobs would be sized: + my $uniform1K = int( 1 + $genomeSize / 1000 ); + # but if they were all small, 1K jobs would be sized + my $allSmall1K = int (1 + (($seqCount / 1000) * 5000)); + # use the smaller size of those two measures + $chunkSize = $uniform1K > $allSmall1K ? $allSmall1K : $uniform1K; +} + $chromBased = ($seqCount <= $HgAutomate::splitThreshold); $useCluster = ($genomeSize > $singleRunSize); &HgAutomate::verbose(2, "\n$db is " . ($chromBased ? "chrom-based" : "scaffold-based") . ".\n" . "Total genome size is " . ($useCluster ? "> $singleRunSize; cluster & cat." : "<= $singleRunSize; single run.") . "\n\n"); # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; &HgAutomate::verbose(1, <<_EOF_ *** All done!$upThrough *** Steps were performed in $buildDir _EOF_ ); if ($stepper->stepPrecedes('trf', $stopStep)) { my $buildDirRel = $buildDir; $buildDirRel =~ s/^$HgAutomate::clusterData\/$db\///; &HgAutomate::verbose(1, <<_EOF_ *** Please check the log file to see if trf had warnings that we should pass on to Gary Benson (ideally with a small test case). *** IMPORTANT: Developer, it is up to you to make use of trfMask.bed! Here is a typical command sequence, to be run after RepeatMasker (or WindowMasker etc.) has completed: cd $HgAutomate::clusterData/$db twoBitMask $db.rmsk.2bit -add $buildDirRel/trfMask.bed $db.2bit *** Again, it is up to you to incorporate trfMask.bed into the final 2bit! _EOF_ ); } &HgAutomate::verbose(1, "\n");