b02e6b88da8d3a3d306017183d22d63a22c3543e
hiram
  Thu Mar 21 22:03:37 2024 -0700
adding ram and cpu arguments no redmine

diff --git src/hg/utils/automation/doAugustus.pl src/hg/utils/automation/doAugustus.pl
index b410ff0..38d221e 100755
--- src/hg/utils/automation/doAugustus.pl
+++ src/hg/utils/automation/doAugustus.pl
@@ -1,91 +1,93 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/doAugustus.pl instead.
 
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
+use AsmHub;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_maskedSeq
     $opt_species
     $opt_utr
-    $opt_ram
     $opt_noDbGenePredCheck
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'partition',   func => \&doPartition },
       { name => 'augustus', func => \&doAugustus },
       { name => 'makeGp', func => \&doMakeGp },
       { name => 'load', func => \&doLoadAugustus },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 # my $bigClusterHub = 'swarm';
 my $bigClusterHub = 'ku';
 my $workhorse = 'hgwdev';
 my $dbHost = 'hgwdev';
+my $ram = '6g';
+my $cpu = 1;
 my $defaultWorkhorse = 'hgwdev';
 my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit";
 my $utr = "off";
-my $ramG = "-ram=8g";
 my $noDbGenePredCheck = 1;    # default yes, use -db for genePredCheck
 my $species = "human";
 my $augustusDir = "/hive/data/outside/augustus/augustus-3.3.1";
 my $augustusConfig="$augustusDir/config";
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base db
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/augustus
                           (necessary when continuing at a later date).
     -maskedSeq seq.2bit   Use seq.2bit as the masked input sequence instead
                           of default ($maskedSeq).
     -utr                  Obsolete, now is automatic (was: Use augustus arg: --UTR=on, default is --UTR=off)
     -noDbGenePredCheck    do not use -db= on genePredCheck, there is no real db
-    -ram Ng             set -ram=Ng argument to para create command (default 8g)
     -species <name>       name from list: human chicken zebrafish, default: human
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
                                 'bigClusterHub' => $bigClusterHub,
+                                'ram' => $ram,
+                                'cpu' => $cpu,
                                 'workhorse' => $defaultWorkhorse);
   print STDERR "
 Automates UCSC's Augustus track construction for the database \$db.  Steps:
     partition:  Creates hard-masked fastas needed for the CpG Island program.
     augustus:   Run gsBig on the hard-masked fastas
     makeGp:   Transform output from gsBig into augustus.gtf augustus.pep and
     load:      Load augustus.gtf and into \$db.
     cleanup:   Removes hard-masked fastas and output from gsBig.
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/augustus unless -buildDir is given.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
@@ -236,54 +238,52 @@
  --AUGUSTUS_CONFIG_PATH=$augustusConfig \\
   --alternatives-from-sampling=true --sample=100 --minexonintronprob=0.2 \\
    --minmeanexonintronprob=0.5 --maxtracks=3 --temperature=2 \\
      --predictionStart=\$start --predictionEnd=\$end \\
       \$fasta --outfile=\$gtfFile --errfile=\$errFile:t
 endif
 
 gzip \$gtfFile
 popd
 mv \$tmpDir/\$gtfFile.gz \$resultGz
 mv \$tmpDir/\$errFile:t \$errFile
 rm -fr \$tmpDir
 _EOF_
   );
 
+  my $paraRun = &HgAutomate::paraRun($ram, $cpu);
   $whatItDoes = "Run augustus on chunked fasta sequences.";
   $bossScript = newBash HgRemoteScript("$runDir/runAugustus.bash", $paraHub,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 (grep -v partBundles ../partition/part.list || /bin/true) | while read twoBit
 do
   chr=`echo \$twoBit |  sed -e 's/.*2bit://;' | awk -F':' '{print \$1}'`
   chrStart=`echo \$twoBit |  sed -e 's/.*2bit://;' | awk -F':' '{print \$2}' | sed -e 's/-.*//;'`
   chrEnd=`echo \$twoBit |  sed -e 's/.*2bit://;' | awk -F':' '{print \$2}' | sed -e 's/.*-//;'`
   echo "runOne \$chrStart \$chrEnd {check in exists+ $buildDir/fasta/\${chr}.fa} {check out exists+ gtf/\$chr/\$chr.\$chrStart.\$chrEnd.gtf.gz}"
 done > jobList
 
 (grep partBundles ../partition/part.list || /bin/true) | while read bundleName
 do
   B=`basename \$bundleName | sed -e 's/.lst//;'`
   echo "runOne 0 0 {check in exists+ $buildDir/fasta/\${B}.fa} {check out exists+ gtf/\${B}/\${B}.0.0.gtf.gz}"
 done >> jobList
 
 chmod +x runOne
 
-/parasol/bin/para $ramG make jobList
-/parasol/bin/para check
-/parasol/bin/para time > run.time
-cat run.time
+$paraRun
 _EOF_
   );
   $bossScript->execute();
 } # doAugustus
 
 #########################################################################
 # * step: make gp [workhorse]
 sub doMakeGp {
   my $runDir = $buildDir;
   &HgAutomate::mustMkdir($runDir);
 
   # First, make sure we're starting clean.
   if (! -e "$runDir/run.augustus/run.time") {
     die "doMakeGp: the previous step augustus did not complete \n" .
       "successfully ($buildDir/run.augustus/run.time does not exist).\nPlease " .
@@ -402,43 +402,54 @@
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 ($db) = @ARGV;
 
 # Force debug and verbose until this is looking pretty solid:
 #$opt_debug = 1;
 #$opt_verbose = 3 if ($opt_verbose < 3);
 
 $noDbGenePredCheck = $opt_noDbGenePredCheck ? 0 : $noDbGenePredCheck;
-$ramG = $opt_ram ? $ramG : "-ram=$opt_ram";
 
 # Establish what directory we will work in.
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/augustus";
 $maskedSeq = $opt_maskedSeq ? $opt_maskedSeq :
   "$HgAutomate::clusterData/$db/$db.2bit";
 $species = $opt_species ? $opt_species : $species;
 if ( -s "${augustusConfig}/species/$species/${species}_utr_probs.pbl" ) {
   $utr = "on";
 } else {
   $utr = "off";
 }
+# Size the cluster RAM request from the assembly: a single twoBitInfo pass gives
+# the largest sequence and the total size; missing/non-numeric output leaves 0.
+my ($maxSeqSize, $asmSize) = (0, 0);
+for my $line (`twoBitInfo $maskedSeq stdout`) {
+  next unless ($line =~ m/\s(\d+)\s*$/);	# last column is the sequence size
+  $asmSize += $1;
+  $maxSeqSize = $1 if ($1 > $maxSeqSize);
+}
+# big genomes: total over 4Gb (4*1024**3 = 4294967296) or any sequence over 1Gb
+$ram = '16g' if ( $asmSize > 4*1024**3 || $maxSeqSize > 1024**3 );
+printf STDERR "# maxSeqSize: %s\n# asmSize %s\n# -ram=%s\n",
+  &AsmHub::commify($maxSeqSize), &AsmHub::commify($asmSize), $ram;
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;