68a21741ea6388775442314e7e71e98c7a15adad
hiram
  Sat Oct 15 22:29:42 2022 -0700
rail para job ram size to 32g to avoid crashes due to out of memory no redmine

diff --git src/hg/utils/automation/doTandemDup.pl src/hg/utils/automation/doTandemDup.pl
index 343d6b2..d83e387 100755
--- src/hg/utils/automation/doTandemDup.pl
+++ src/hg/utils/automation/doTandemDup.pl
@@ -1,407 +1,407 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/doTandemDup.pl instead.
 
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_kmerSize
     $opt_gapSize
     $opt_buildDir
     $opt_twoBit
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'kmers',   func => \&doKmers },
       { name => 'pairedEnds',   func => \&doPairedEnds },
       { name => 'collapsePairedEnds',   func => \&doCollapsePairedEnds },
       { name => 'load',   func => \&doLoad },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $bigClusterHub = 'ku';
 my $workhorse = 'hgwdev';
 my $defaultWorkhorse = 'hgwdev';
 my $twoBit = "$HgAutomate::clusterData/\$db/\$db.2bit";
 my $kmerSize = 30;	# initial default kmer size
 my $gapSize = 20000;	# initial default gap size
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base db
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -kmerSize N       Initial kmer size N instead of default: $kmerSize
     -gapSize N        Initial gap size N instead of default: $gapSize
     -buildDir dir     Use dir instead of default
                       $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/tandemDups
     -twoBit seq.2bit  Use seq.2bit as the input sequence instead
                       of default ($twoBit).
 
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => $defaultWorkhorse,
 						'bigClusterHub' => $bigClusterHub,
 						'smallClusterHub' => '');
   print STDERR "
 Automates UCSC's build of tandemDups track.  Steps:
     kmers - generate kmers over all sequence of size $kmerSize
     pairedEnds - pair up all kmers that are identical with gap from 1 to $gapSize
     collapsePairedEnds - collapse the pairedEnds when overlapping and same gap size
     load:       Collect results into bed file and load table '<db>.tandemDups'
     cleanup:    Removes or compresses intermediate files.
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/tandemDups unless -buildDir is given.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
    database/assembly \$db.
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 # Globals:
 # Command line args: db
 my ($db);
 # Other:
 my ($buildDir, $secondsStart, $secondsEnd, $dbExists, $kmersMinus1);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'kmerSize=i',
 		      'gapSize=i',
 		      'buildDir=s',
 		      'twoBit=s',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $workhorse = $opt_workhorse if ($opt_workhorse);
   $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
 # * step: kmers [bigClusterHub]
 sub doKmers {
   my $runDir = "$buildDir/kmers";
   my $paraHub = $bigClusterHub;
 
   # First, make sure we're starting clean.
   if ( ! $opt_debug && ( -s "$runDir/runOne" ) ) {
     die "kmers: looks like this was run successfully already " .
       "(file db/bed/tandemDups/kmers/runOne exists).  Either run with -continue pairedEnds or some later " .
         "stage, or move aside/remove $runDir and run again.\n";
   }
 
   &HgAutomate::mustMkdir($runDir);
 
   my $templateCmd = ('runOne $(path1) {check out exists tmp/$(path1).' .
             "$kmersMinus1" . 'mer.txt.gz}');
   &HgAutomate::makeGsub($runDir, $templateCmd);
   `touch "$runDir/para_hub_$paraHub"`;
 
   my $whatItDoes = "Generate kmers for entire sequence.";
   my $bossScript = newBash HgRemoteScript("$runDir/runKmers.bash",
 		$paraHub, $runDir, $whatItDoes);
 
   my $paraRun = &HgAutomate::paraRun();
 
   $bossScript->add(<<_EOF_
 twoBitInfo $twoBit stdout | cut -f1 > part.list
 printf '#!/bin/bash
 
 set -beEu -o pipefail
 
 export fa=\$1
 export result=\$2
 
 mkdir -p tmp
 
 twoBitToFa ${twoBit}:\$fa stdout \\
   | $Bin/kmerPrint.pl $kmersMinus1 stdin | gzip -c > \$result
 ' > runOne
 chmod +x runOne
 gensub2 part.list single gsub jobList
 $paraRun
 _EOF_
   );
   $bossScript->execute();
 } # doKmers()
 
 #########################################################################
 # * step: pairedEnds [bigClusterHub]
 sub doPairedEnds {
   my $prevRunDir = "$buildDir/kmers";
   my $runDir = "$buildDir/pairedEnds";
   my $paraHub = $bigClusterHub;
 
   # First, make sure we're starting clean.
   if ( ! $opt_debug && ( -s "$runDir/runOne" ) ) {
     die "pairedEnds: looks like this was run successfully already " .
       "(file db/bed/tandemDups/pairedEnds/runOne exists).  Either run with -continue blat or some later " .
         "stage, or move aside/remove $runDir and run again.\n";
   }
   if ( ! $opt_debug && ( ! -s "$prevRunDir/run.time" ) ) {
     die "pairedEnds: looks like previous kmers step has not been competed, " .
       "file db/bed/tandemDups/kmers/run.time does not exist. run with: -continue kmers to perform previous step.\n";
   }
 
   &HgAutomate::mustMkdir($runDir);
 
   my $templateCmd = ("runOne $kmersMinus1 $gapSize " . '$(path1) {check out exists tmp/$(path1).bed.gz}');
   &HgAutomate::makeGsub($runDir, $templateCmd);
   `touch "$runDir/para_hub_$paraHub"`;
 
   my $whatItDoes = "Collect kmers into pairs.";
   my $bossScript = newBash HgRemoteScript("$runDir/runPairedEnds.bash",
 		$paraHub, $runDir, $whatItDoes);
 
-  # trying 16G ram to see if jobs will not fail
-  my $paraRun = "para make -ram=16g jobList
+  # trying 32G ram to see if jobs will not fail
+  my $paraRun = "para make -ram=32g jobList
 para check
 para time > run.time
 cat run.time\n";
 
   $bossScript->add(<<_EOF_
 ln -s $prevRunDir/part.list .
 printf '#!/bin/bash
 
 set -beEu -o pipefail
 
 export kmerSize=\$1
 export gapSize=\$2
 export chrName=\$3
 export result=\$4
 
 mkdir -p tmp
 
 $Bin/kmerPairs.pl \$kmerSize \$gapSize \$chrName \$result
 ' > runOne
 chmod +x runOne
 gensub2 part.list single gsub jobList
 $paraRun
 _EOF_
   );
   $bossScript->execute();
 } # doPairedEnds()
 
 #########################################################################
 # * step: collapsePairedEnds [bigClusterHub]
 sub doCollapsePairedEnds {
   my $prevRunDir = "$buildDir/pairedEnds";
   my $runDir = "$buildDir/collapsePairedEnds";
   my $paraHub = $bigClusterHub;
 
   # First, make sure we're starting clean.
   if ( ! $opt_debug && ( -s "$runDir/runOne" ) ) {
     die "collapsePairedEnds: looks like this was run successfully already " .
       "(file db/bed/tandemDups/collapsePairedEnds/runOne exists).  Either run with -continue load or some later " .
         "stage, or move aside/remove $runDir and run again.\n";
   }
   if ( ! $opt_debug && ( ! -s "$prevRunDir/run.time" ) ) {
     die "pairedEnds: looks like previous pairedEnds step has not been competed, " .
       "file db/bed/tandemDups/pairedEnds/run.time does not exist. run with: -continue pairedEnds to perform previous step.\n";
   }
 
   &HgAutomate::mustMkdir($runDir);
 
   my $templateCmd = ('runOne $(path1) {check out exists tmp/$(path1).bed.gz}');
   &HgAutomate::makeGsub($runDir, $templateCmd);
   `touch "$runDir/para_hub_$paraHub"`;
 
   my $whatItDoes = "Collapse kmers pairs into larger paired ends.";
   my $bossScript = newBash HgRemoteScript("$runDir/runCollapsePairedEnds.bash",
 		$paraHub, $runDir, $whatItDoes);
 
-  # trying 16G ram to see if jobs will not fail
-  my $paraRun = "para make -ram=16g jobList
+  # trying 32G ram to see if jobs will not fail
+  my $paraRun = "para make -ram=32g jobList
 para check
 para time > run.time
 cat run.time\n";
 
   $bossScript->add(<<_EOF_
 ln -s $prevRunDir/part.list .
 printf '#!/bin/bash
 
 set -beEu -o pipefail
 
 export chrName=\$1
 export result=\$2
 mkdir -p tmp
 
 $Bin/kmerCollapsePairedEnds.pl $kmersMinus1 ../pairedEnds/tmp/\$chrName.bed.gz \\
    | gzip -c > tmp/\$chrName.bed.gz
 ' > runOne
 chmod +x runOne
 gensub2 part.list single gsub jobList
 $paraRun
 _EOF_
   );
   $bossScript->execute();
 } # doCollapsePairedEnds()
 
 #########################################################################
 # * step: load [dbHost]
 sub doLoad {
   my $prevRunDir = "$buildDir/collapsePairedEnds";
   my $runDir = "$buildDir";
 
   # First, make sure we're starting clean.
   if ( ! $opt_debug && ( -s "$runDir/loadUp.bash" ) ) {
     die "load: looks like this was run successfully already, " .
       "file db/bed/tandemDups/loadUp.bash exists.  Can run with: -continue cleanup " .
         "to complete this procedure.\n";
   }
   # And, must have something to load
   if ( ! $opt_debug && ( ! -s "$prevRunDir/run.time" ) ) {
     die "load: previous step collapsePairedEnds has not completed,\n" .
       "Can run with: -continue collapsePairedEnds or check why previous step has not completed.\n";
   }
 
   my $whatItDoes = "Collect results into single file, if db exists, load table tandemDups.";
   my $bossScript = newBash HgRemoteScript("$runDir/loadUp.bash",
 		$workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 for F in collapsePairedEnds/tmp/*.bed.gz
 do
   zcat \$F | awk '\$5 > $kmersMinus1'
 done | sort -k1,1 -k2,2n | gzip -c > $db.tandemDups.bed.gz
 twoBitInfo $twoBit stdout | sort -k2nr > $db.chrom.sizes
 export maxScore=`zcat $db.tandemDups.bed.gz | cut -f5 | sort -k1,1nr | head -1`
 zcat $db.tandemDups.bed.gz | awk -vmax=\$maxScore '{score=\$5; newScore=int(1000*score/max); printf "%s\\t%s\\t%s\\t%s\\t", \$1, \$2, \$3, \$4; printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", newScore, \$6, \$7, \$8, \$9, \$10, \$11, \$12, score}' > $db.reScore.bed
 bedToBigBed -type=bed12+1 $db.reScore.bed $db.chrom.sizes $db.tandemDups.bb
 bedToExons $db.tandemDups.bed.gz stdout | bedSingleCover.pl stdin > $db.exons.bed
 export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' $db.exons.bed`
 export asmSize=`awk '{sum+=\$2}END{printf "%d", sum}' $db.chrom.sizes`
 export perCent=`echo \$baseCount \$asmSize | awk '{printf "%.3f", 100.0*\$1/\$2}'`
 printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSize" "\$perCent" > fb.$db.tandemDups.txt
 _EOF_
   );
 
   # do not load if db does not exist:
   if ( $opt_debug || $dbExists ) {
       $bossScript->add(<<_EOF_
 # do not load empty files
 if [ -s $db.reScore.bed ]; then
   hgLoadBed -as=\$HOME/kent/src/hg/lib/fullBed.as -type=bed4+8 $db tandemDups $db.tandemDups.bed.gz
   checkTableCoords $db tandemDups
 fi
 _EOF_
       );
   }
 
   $bossScript->execute();
 } # doLoad
 
 #########################################################################
 # * step: cleanup [workhorse]
 sub doCleanup {
   my $runDir = "$buildDir";
 
   # Skip this if already done
   if ( ! $opt_debug && ( -s "$runDir/doCleanup.csh" ) ) {
     printf STDERR "# cleanup already done\n";
   } else {
     my $whatItDoes = "Clean up and/or compress intermediate files.";
     my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $workhorse,
 				      $runDir, $whatItDoes);
     $bossScript->add(<<_EOF_
 rm -f $db.reScore.bed $db.exons.bed
 rm -fr kmers/tmp pairedEnds/tmp collapsePairedEnds/tmp $db.chrom.sizes bed.tab
 rm -fr kmers/err pairedEnds/err collapsePairedEnds/err
 _EOF_
     );
     $bossScript->execute();
   }
 } # doCleanup
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 ($db) = @ARGV;
 
 # Force debug and verbose until this is looking pretty solid:
 # $opt_debug = 1;
 $opt_verbose = 3 if ($opt_verbose < 3);
 
 # Establish directory to work in:
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/tandemDups";
 $twoBit = $opt_twoBit ? $opt_twoBit :
   "$HgAutomate::clusterData/$db/$db.2bit";
 $kmerSize = $opt_kmerSize ? $opt_kmerSize : $kmerSize;
 # initial kmer set are size one less than specified kmerSize
 # this gets the first set of kmer pairs established correctly at size kmerSize
 $kmersMinus1 = $kmerSize - 1;
 $gapSize = $opt_gapSize ? $opt_gapSize : $gapSize;
 &HgAutomate::verbose(1, "# kmer size: $kmerSize\n");
 &HgAutomate::verbose(1, "#  gap size: $gapSize\n");
 &HgAutomate::verbose(1, "# 2bit file: $twoBit\n");
 die "kmerSize must be >= 30, this is the lower limit" if ($kmerSize < 30);
 die "gapSize must be >= 1, this is the lower limit" if ($gapSize < 1);
 die "can not find 2bit file: '$twoBit'" if ( ! -s $twoBit);
 
 # may be working on a 2bit file that does not have a database browser
 $dbExists = 0;
 $dbExists = 1 if (&HgAutomate::databaseExists($dbHost, $db));
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;
 
 &HgAutomate::verbose(1,
 	"\n *** All done !$upThrough  Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n");
 &HgAutomate::verbose(1,
 	" *** Steps were performed in $buildDir\n");
 &HgAutomate::verbose(1, "\n");