68a21741ea6388775442314e7e71e98c7a15adad hiram Sat Oct 15 22:29:42 2022 -0700 rail para job ram size to 32g to avoid crashes due to out of memory no redmine diff --git src/hg/utils/automation/doTandemDup.pl src/hg/utils/automation/doTandemDup.pl index 343d6b2..d83e387 100755 --- src/hg/utils/automation/doTandemDup.pl +++ src/hg/utils/automation/doTandemDup.pl @@ -1,407 +1,407 @@ #!/usr/bin/env perl # DO NOT EDIT the /cluster/bin/scripts copy of this file -- # edit ~/kent/src/hg/utils/automation/doTandemDup.pl instead. use Getopt::Long; use warnings; use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_kmerSize $opt_gapSize $opt_buildDir $opt_twoBit /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'kmers', func => \&doKmers }, { name => 'pairedEnds', func => \&doPairedEnds }, { name => 'collapsePairedEnds', func => \&doCollapsePairedEnds }, { name => 'load', func => \&doLoad }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $bigClusterHub = 'ku'; my $workhorse = 'hgwdev'; my $defaultWorkhorse = 'hgwdev'; my $twoBit = "$HgAutomate::clusterData/\$db/\$db.2bit"; my $kmerSize = 30; # initial default kmer size my $gapSize = 20000; # initial default gap size my $base = $0; $base =~ s/^(.*\/)?//; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base db options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -kmerSize N Initial kmer size N instead of default: $kmerSize -gapSize N Initial gap size N instead of default: $gapSize -buildDir dir Use dir instead of default $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/tandemDups -twoBit seq.2bit Use seq.2bit as the input sequence instead of default ($twoBit). _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $defaultWorkhorse, 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => ''); print STDERR " Automates UCSC's build of tandemDups track. Steps: kmers - generate kmers over all sequence of size $kmerSize pairedEnds - pair up all kmers that are identical with gap from 1 to $gapSize collapsePairedEnds - collapse the pairedEnds when overlapping and same gap size load: Collect results into bed file and load table '.tandemDups' cleanup: Removes or compresses intermediate files. All operations are performed in the build directory which is $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/tandemDups unless -buildDir is given. "; # Detailed help (-help): print STDERR " Assumptions: 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for database/assembly \$db. " if ($detailed); print "\n"; exit $status; } # Globals: # Command line args: db my ($db); # Other: my ($buildDir, $secondsStart, $secondsEnd, $dbExists, $kmersMinus1); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'kmerSize=i', 'gapSize=i', 'buildDir=s', 'twoBit=s', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $workhorse = $opt_workhorse if ($opt_workhorse); $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### # * step: kmers [bigClusterHub] sub doKmers { my $runDir = "$buildDir/kmers"; my $paraHub = $bigClusterHub; # First, make sure we're starting clean. if ( ! $opt_debug && ( -s "$runDir/runOne" ) ) { die "kmers: looks like this was run successfully already " . "(file db/bed/tandemDups/kmers/runOne exists). Either run with -continue pairedEnds or some later " . "stage, or move aside/remove $runDir and run again.\n"; } &HgAutomate::mustMkdir($runDir); my $templateCmd = ('runOne $(path1) {check out exists tmp/$(path1).' . "$kmersMinus1" . 'mer.txt.gz}'); &HgAutomate::makeGsub($runDir, $templateCmd); `touch "$runDir/para_hub_$paraHub"`; my $whatItDoes = "Generate kmers for entire sequence."; my $bossScript = newBash HgRemoteScript("$runDir/runKmers.bash", $paraHub, $runDir, $whatItDoes); my $paraRun = &HgAutomate::paraRun(); $bossScript->add(<<_EOF_ twoBitInfo $twoBit stdout | cut -f1 > part.list printf '#!/bin/bash set -beEu -o pipefail export fa=\$1 export result=\$2 mkdir -p tmp twoBitToFa ${twoBit}:\$fa stdout \\ | $Bin/kmerPrint.pl $kmersMinus1 stdin | gzip -c > \$result ' > runOne chmod +x runOne gensub2 part.list single gsub jobList $paraRun _EOF_ ); $bossScript->execute(); } # doKmers() ######################################################################### # * step: pairedEnds [bigClusterHub] sub doPairedEnds { my $prevRunDir = "$buildDir/kmers"; my $runDir = "$buildDir/pairedEnds"; my $paraHub = $bigClusterHub; # First, make sure we're starting clean. if ( ! $opt_debug && ( -s "$runDir/runOne" ) ) { die "pairedEnds: looks like this was run successfully already " . "(file db/bed/tandemDups/pairedEnds/runOne exists). Either run with -continue blat or some later " . "stage, or move aside/remove $runDir and run again.\n"; } if ( ! $opt_debug && ( ! -s "$prevRunDir/run.time" ) ) { die "pairedEnds: looks like previous kmers step has not been competed, " . "file db/bed/tandemDups/kmers/run.time does not exist. run with: -continue kmers to perform previous step.\n"; } &HgAutomate::mustMkdir($runDir); my $templateCmd = ("runOne $kmersMinus1 $gapSize " . '$(path1) {check out exists tmp/$(path1).bed.gz}'); &HgAutomate::makeGsub($runDir, $templateCmd); `touch "$runDir/para_hub_$paraHub"`; my $whatItDoes = "Collect kmers into pairs."; my $bossScript = newBash HgRemoteScript("$runDir/runPairedEnds.bash", $paraHub, $runDir, $whatItDoes); - # trying 16G ram to see if jobs will not fail - my $paraRun = "para make -ram=16g jobList + # trying 32G ram to see if jobs will not fail + my $paraRun = "para make -ram=32g jobList para check para time > run.time cat run.time\n"; $bossScript->add(<<_EOF_ ln -s $prevRunDir/part.list . printf '#!/bin/bash set -beEu -o pipefail export kmerSize=\$1 export gapSize=\$2 export chrName=\$3 export result=\$4 mkdir -p tmp $Bin/kmerPairs.pl \$kmerSize \$gapSize \$chrName \$result ' > runOne chmod +x runOne gensub2 part.list single gsub jobList $paraRun _EOF_ ); $bossScript->execute(); } # doPairedEnds() ######################################################################### # * step: collapsePairedEnds [bigClusterHub] sub doCollapsePairedEnds { my $prevRunDir = "$buildDir/pairedEnds"; my $runDir = "$buildDir/collapsePairedEnds"; my $paraHub = $bigClusterHub; # First, make sure we're starting clean. if ( ! $opt_debug && ( -s "$runDir/runOne" ) ) { die "collapsePairedEnds: looks like this was run successfully already " . "(file db/bed/tandemDups/collapsePairedEnds/runOne exists). Either run with -continue load or some later " . "stage, or move aside/remove $runDir and run again.\n"; } if ( ! $opt_debug && ( ! -s "$prevRunDir/run.time" ) ) { die "pairedEnds: looks like previous pairedEnds step has not been competed, " . "file db/bed/tandemDups/pairedEnds/run.time does not exist. run with: -continue pairedEnds to perform previous step.\n"; } &HgAutomate::mustMkdir($runDir); my $templateCmd = ('runOne $(path1) {check out exists tmp/$(path1).bed.gz}'); &HgAutomate::makeGsub($runDir, $templateCmd); `touch "$runDir/para_hub_$paraHub"`; my $whatItDoes = "Collapse kmers pairs into larger paired ends."; my $bossScript = newBash HgRemoteScript("$runDir/runCollapsePairedEnds.bash", $paraHub, $runDir, $whatItDoes); - # trying 16G ram to see if jobs will not fail - my $paraRun = "para make -ram=16g jobList + # trying 32G ram to see if jobs will not fail + my $paraRun = "para make -ram=32g jobList para check para time > run.time cat run.time\n"; $bossScript->add(<<_EOF_ ln -s $prevRunDir/part.list . printf '#!/bin/bash set -beEu -o pipefail export chrName=\$1 export result=\$2 mkdir -p tmp $Bin/kmerCollapsePairedEnds.pl $kmersMinus1 ../pairedEnds/tmp/\$chrName.bed.gz \\ | gzip -c > tmp/\$chrName.bed.gz ' > runOne chmod +x runOne gensub2 part.list single gsub jobList $paraRun _EOF_ ); $bossScript->execute(); } # doCollapsePairedEnds() ######################################################################### # * step: load [dbHost] sub doLoad { my $prevRunDir = "$buildDir/collapsePairedEnds"; my $runDir = "$buildDir"; # First, make sure we're starting clean. if ( ! $opt_debug && ( -s "$runDir/loadUp.bash" ) ) { die "load: looks like this was run successfully already, " . "file db/bed/tandemDups/loadUp.bash exists. Can run with: -continue cleanup " . "to complete this procedure.\n"; } # And, must have something to load if ( ! $opt_debug && ( ! -s "$prevRunDir/run.time" ) ) { die "load: previous step collapsePairedEnds has not completed,\n" . "Can run with: -continue collapsePairedEnds or check why previous step has not completed.\n"; } my $whatItDoes = "Collect results into single file, if db exists, load table tandemDups."; my $bossScript = newBash HgRemoteScript("$runDir/loadUp.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ for F in collapsePairedEnds/tmp/*.bed.gz do zcat \$F | awk '\$5 > $kmersMinus1' done | sort -k1,1 -k2,2n | gzip -c > $db.tandemDups.bed.gz twoBitInfo $twoBit stdout | sort -k2nr > $db.chrom.sizes export maxScore=`zcat $db.tandemDups.bed.gz | cut -f5 | sort -k1,1nr | head -1` zcat $db.tandemDups.bed.gz | awk -vmax=\$maxScore '{score=\$5; newScore=int(1000*score/max); printf "%s\\t%s\\t%s\\t%s\\t", \$1, \$2, \$3, \$4; printf "%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\t%s\\n", newScore, \$6, \$7, \$8, \$9, \$10, \$11, \$12, score}' > $db.reScore.bed bedToBigBed -type=bed12+1 $db.reScore.bed $db.chrom.sizes $db.tandemDups.bb bedToExons $db.tandemDups.bed.gz stdout | bedSingleCover.pl stdin > $db.exons.bed export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' $db.exons.bed` export asmSize=`awk '{sum+=\$2}END{printf "%d", sum}' $db.chrom.sizes` export perCent=`echo \$baseCount \$asmSize | awk '{printf "%.3f", 100.0*\$1/\$2}'` printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSize" "\$perCent" > fb.$db.tandemDups.txt _EOF_ ); # do not load if db does not exist: if ( $opt_debug || $dbExists ) { $bossScript->add(<<_EOF_ # do not load empty files if [ -s $db.reScore.bed ]; then hgLoadBed -as=\$HOME/kent/src/hg/lib/fullBed.as -type=bed4+8 $db tandemDups $db.tandemDups.bed.gz checkTableCoords $db tandemDups fi _EOF_ ); } $bossScript->execute(); } # doLoad ######################################################################### # * step: cleanup [workhorse] sub doCleanup { my $runDir = "$buildDir"; # Skip this if already done if ( ! $opt_debug && ( -s "$runDir/doCleanup.csh" ) ) { printf STDERR "# cleanup already done\n"; } else { my $whatItDoes = "Clean up and/or compress intermediate files."; my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ rm -f $db.reScore.bed $db.exons.bed rm -fr kmers/tmp pairedEnds/tmp collapsePairedEnds/tmp $db.chrom.sizes bed.tab rm -fr kmers/err pairedEnds/err collapsePairedEnds/err _EOF_ ); $bossScript->execute(); } } # doCleanup ######################################################################### # main # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); # Make sure we have valid options and exactly 1 argument: &checkOptions(); &usage(1) if (scalar(@ARGV) != 1); $secondsStart = `date "+%s"`; chomp $secondsStart; ($db) = @ARGV; # Force debug and verbose until this is looking pretty solid: # $opt_debug = 1; $opt_verbose = 3 if ($opt_verbose < 3); # Establish directory to work in: $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/tandemDups"; $twoBit = $opt_twoBit ? $opt_twoBit : "$HgAutomate::clusterData/$db/$db.2bit"; $kmerSize = $opt_kmerSize ? $opt_kmerSize : $kmerSize; # initial kmer set are size one less than specified kmerSize # this gets the first set of kmer pairs established correctly at size kmerSize $kmersMinus1 = $kmerSize - 1; $gapSize = $opt_gapSize ? $opt_gapSize : $gapSize; &HgAutomate::verbose(1, "# kmer size: $kmerSize\n"); &HgAutomate::verbose(1, "# gap size: $gapSize\n"); &HgAutomate::verbose(1, "# 2bit file: $twoBit\n"); die "kmerSize must be >= 30, this is the lower limit" if ($kmerSize < 30); die "gapSize must be >= 1, this is the lower limit" if ($gapSize < 1); die "can not find 2bit file: '$twoBit'" if ( ! -s $twoBit); # may be working on a 2bit file that does not have a database browser $dbExists = 0; $dbExists = 1 if (&HgAutomate::databaseExists($dbHost, $db)); # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; $secondsEnd = `date "+%s"`; chomp $secondsEnd; my $elapsedSeconds = $secondsEnd - $secondsStart; my $elapsedMinutes = int($elapsedSeconds/60); $elapsedSeconds -= $elapsedMinutes * 60; &HgAutomate::verbose(1, "\n *** All done !$upThrough Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n"); &HgAutomate::verbose(1, " *** Steps were performed in $buildDir\n"); &HgAutomate::verbose(1, "\n");