b02e6b88da8d3a3d306017183d22d63a22c3543e hiram Thu Mar 21 22:03:37 2024 -0700 adding ram and cpu arguments no redmine diff --git src/hg/utils/automation/doAugustus.pl src/hg/utils/automation/doAugustus.pl index b410ff0..38d221e 100755 --- src/hg/utils/automation/doAugustus.pl +++ src/hg/utils/automation/doAugustus.pl @@ -1,91 +1,93 @@ #!/usr/bin/env perl # DO NOT EDIT the /cluster/bin/scripts copy of this file -- # edit ~/kent/src/hg/utils/automation/doAugustus.pl instead. use Getopt::Long; use warnings; use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; +use AsmHub; use HgRemoteScript; use HgStepManager; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_maskedSeq $opt_species $opt_utr - $opt_ram $opt_noDbGenePredCheck /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'partition', func => \&doPartition }, { name => 'augustus', func => \&doAugustus }, { name => 'makeGp', func => \&doMakeGp }, { name => 'load', func => \&doLoadAugustus }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: # my $bigClusterHub = 'swarm'; my $bigClusterHub = 'ku'; my $workhorse = 'hgwdev'; my $dbHost = 'hgwdev'; +my $ram = '6g'; +my $cpu = 1; my $defaultWorkhorse = 'hgwdev'; my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit"; my $utr = "off"; -my $ramG = "-ram=8g"; my $noDbGenePredCheck = 1; # default yes, use -db for genePredCheck my $species = "human"; my $augustusDir = "/hive/data/outside/augustus/augustus-3.3.1"; my $augustusConfig="$augustusDir/config"; my $base = $0; $base =~ s/^(.*\/)?//; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base db options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Use dir instead of default $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/augustus (necessary when continuing at a later date). -maskedSeq seq.2bit Use seq.2bit as the masked input sequence instead of default ($maskedSeq). -utr Obsolete, now is automatic (was: Use augustus arg: --UTR=on, default is --UTR=off) -noDbGenePredCheck do not use -db= on genePredCheck, there is no real db - -ram Ng set -ram=Ng argument to para create command (default 8g) -species <name> name from list: human chicken zebrafish, default: human _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'bigClusterHub' => $bigClusterHub, + 'ram' => $ram, + 'cpu' => $cpu, 'workhorse' => $defaultWorkhorse); print STDERR " Automates UCSC's Augustus track construction for the database \$db. Steps: partition: Creates hard-masked fastas needed for the CpG Island program. augustus: Run gsBig on the hard-masked fastas makeGp: Transform output from gsBig into augustus.gtf augustus.pep and load: Load augustus.gtf and into \$db. cleanup: Removes hard-masked fastas and output from gsBig. All operations are performed in the build directory which is $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/augustus unless -buildDir is given. "; # Detailed help (-help): print STDERR " Assumptions: 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for @@ -236,54 +238,52 @@ --AUGUSTUS_CONFIG_PATH=$augustusConfig \\ --alternatives-from-sampling=true --sample=100 --minexonintronprob=0.2 \\ --minmeanexonintronprob=0.5 --maxtracks=3 --temperature=2 \\ --predictionStart=\$start --predictionEnd=\$end \\ \$fasta --outfile=\$gtfFile --errfile=\$errFile:t endif gzip \$gtfFile popd mv \$tmpDir/\$gtfFile.gz \$resultGz mv \$tmpDir/\$errFile:t \$errFile rm -fr \$tmpDir _EOF_ ); + my $paraRun = &HgAutomate::paraRun($ram, $cpu); $whatItDoes = "Run augustus on chunked fasta sequences."; $bossScript = newBash HgRemoteScript("$runDir/runAugustus.bash", $paraHub, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ (grep -v partBundles ../partition/part.list || /bin/true) | while read twoBit do chr=`echo \$twoBit | sed -e 's/.*2bit://;' | awk -F':' '{print \$1}'` chrStart=`echo \$twoBit | sed -e 's/.*2bit://;' | awk -F':' '{print \$2}' | sed -e 's/-.*//;'` chrEnd=`echo \$twoBit | sed -e 's/.*2bit://;' | awk -F':' '{print \$2}' | sed -e 's/.*-//;'` echo "runOne \$chrStart \$chrEnd {check in exists+ $buildDir/fasta/\${chr}.fa} {check out exists+ gtf/\$chr/\$chr.\$chrStart.\$chrEnd.gtf.gz}" done > jobList (grep partBundles ../partition/part.list || /bin/true) | while read bundleName do B=`basename \$bundleName | sed -e 's/.lst//;'` echo "runOne 0 0 {check in exists+ $buildDir/fasta/\${B}.fa} {check out exists+ gtf/\${B}/\${B}.0.0.gtf.gz}" done >> jobList chmod +x runOne -/parasol/bin/para $ramG make jobList -/parasol/bin/para check -/parasol/bin/para time > run.time -cat run.time +$paraRun _EOF_ ); $bossScript->execute(); } # doAugustus ######################################################################### # * step: make gp [workhorse] sub doMakeGp { my $runDir = $buildDir; &HgAutomate::mustMkdir($runDir); # First, make sure we're starting clean. if (! -e "$runDir/run.augustus/run.time") { die "doMakeGp: the previous step augustus did not complete \n" . "successfully ($buildDir/run.augustus/run.time does not exist).\nPlease " . @@ -402,43 +402,54 @@ # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); # Make sure we have valid options and exactly 1 argument: &checkOptions(); &usage(1) if (scalar(@ARGV) != 1); $secondsStart = `date "+%s"`; chomp $secondsStart; ($db) = @ARGV; # Force debug and verbose until this is looking pretty solid: #$opt_debug = 1; #$opt_verbose = 3 if ($opt_verbose < 3); $noDbGenePredCheck = $opt_noDbGenePredCheck ? 0 : $noDbGenePredCheck; -$ramG = $opt_ram ? $ramG : "-ram=$opt_ram"; # Establish what directory we will work in. $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/augustus"; $maskedSeq = $opt_maskedSeq ? $opt_maskedSeq : "$HgAutomate::clusterData/$db/$db.2bit"; $species = $opt_species ? $opt_species : $species; if ( -s "${augustusConfig}/species/$species/${species}_utr_probs.pbl" ) { $utr = "on"; } else { $utr = "off"; } +my $maxSeqSize = `twoBitInfo $maskedSeq stdout | sort -k2,2nr | head -1 | awk '{printf "%s", \$NF}'`; +my $asmSize = `twoBitInfo $maskedSeq stdout | ave -col=2 stdin | grep -w total | awk '{printf "%d", \$NF}'`; +chomp $maxSeqSize; +chomp $asmSize; +# big genomes are over 4Gb: 4*1024*1024*1024 = 4294967296 +# or if maxSeqSize over 1Gb +if ( "$asmSize" > 4*1024**3 || $maxSeqSize > 1024**3 ) { + $ram = '16g'; +} +printf STDERR "# maxSeqSize: %s\n", &AsmHub::commify($maxSeqSize); +printf STDERR "# asmSize %s\n", &AsmHub::commify($asmSize); +printf STDERR "# -ram=%s\n", $ram; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; $secondsEnd = `date "+%s"`; chomp $secondsEnd; my $elapsedSeconds = $secondsEnd - $secondsStart; my $elapsedMinutes = int($elapsedSeconds/60); $elapsedSeconds -= $elapsedMinutes * 60;