936a6490897222bffd01a9117c6ea9bd9c8cd41a hiram Wed Nov 17 10:22:28 2021 -0800 eliminate cluster runs use simple command on whole sequence refs #28401 diff --git src/hg/utils/automation/doCpgIslands.pl src/hg/utils/automation/doCpgIslands.pl index 5391357..c573ff1 100755 --- src/hg/utils/automation/doCpgIslands.pl +++ src/hg/utils/automation/doCpgIslands.pl @@ -20,30 +20,32 @@ $opt_maskedSeq $opt_chromSizes $opt_tableName /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'hardMask', func => \&doHardMask }, { name => 'cpg', func => \&doCpg }, { name => 'makeBed', func => \&doMakeBed }, { name => 'load', func => \&doLoadCpg }, { name => 'cleanup', func => \&doCleanup }, ] ); +my $cpgLh = "/hive/data/staging/data/cpgIslandExt/cpglh"; + # Option defaults: # my $bigClusterHub = 'swarm'; my $bigClusterHub = 'ku'; # my $smallClusterHub = 'encodek'; my $smallClusterHub = 'ku'; my $workhorse = 'hgwdev'; my $dbHost = 'hgwdev'; my $defaultWorkhorse = 'hgwdev'; my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit"; my $chromSizes = "$HgAutomate::clusterData/\$db/chrom.sizes"; my $base = $0; $base =~ s/^(.*\/)?//; sub usage { @@ -109,193 +111,135 @@ ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $workhorse = $opt_workhorse if ($opt_workhorse); $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub); $smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### # * step: hard mask [workhorse] sub doHardMask { - # Set up and perform the cluster run to run the hardMask sequence. - my $runDir = "$buildDir"; - my $outRoot = 'hardMaskedFa'; - - # First, make sure we're starting clean. - if ( ! $opt_debug && ( -d "$runDir/$outRoot" || -s "$runDir/hardMask.done" ) ) { - die "doHardMask: looks like this was run successfully already " . - "(directory hardMaskedFa exists). Either run with -continue cpg or some later " . - "stage, or move aside/remove $runDir/$outRoot and run again.\n"; - } - - my $whatItDoes = "Constructs $outRoot/*.2bit files for processing with cphlh."; - my $bossScript = newBash HgRemoteScript("$runDir/doHardMask.bash", $workhorse, - $runDir, $whatItDoes); - - $bossScript->add(<<_EOF_ -rm -fr parts $outRoot -mkdir $outRoot -export twoBit=\"$maskedSeq\" -export maxSize=`sort -k2nr $chromSizes | head -1 | awk '{print \$2}'` -/cluster/bin/scripts/partitionSequence.pl -lstDir parts \$maxSize 0 \$twoBit $chromSizes 30 > /dev/null -for L in parts/part*.lst -do - B=`basename \$L | sed -e 's/.lst//;'` - sed -e 's/.*.2bit://; s/:0-.*//;' \${L} > \${B}.list - twoBitToFa -seqList=\$B.list \${twoBit} stdout | maskOutFa stdin hard stdout \\ - | faToTwoBit stdin $outRoot/\$B.t.2bit - rm -f \${B}.list - twoBitToFa $outRoot/\$B.t.2bit stdout | faCount stdin | egrep -v \"^total|^#seq\" | awk '\$2-\$7 > 200 { printf \"%s\\n\", \$1}' > \$B.list - if [ -s \$B.list ]; then - twoBitToFa -seqList=\$B.list $outRoot/\$B.t.2bit stdout | faToTwoBit stdin $outRoot/\$B.2bit - fi - rm -f $outRoot/\$B.t.2bit \$B.list -done -date > hardMask.done -_EOF_ - ); - $bossScript->execute(); + printf STDERR "# doHardMask: obsolete step, no longer needed\n"; + return 0; } # doHardMask ######################################################################### -# * step: cpg [bigClusterHub] +# * step: cpg [workhorse] sub doCpg { # Set up and perform the cluster run to run the CpG function on the # hard masked sequence. my $paraHub = $bigClusterHub; my $runDir = $buildDir; - # First, make sure previous step is completed - if (! $opt_debug && ! -s "$runDir/hardMask.done") { - die "doCpg: previous step hardMask has not completed\n"; - } # Second, make sure we're starting clean. - if (-e "$runDir/run.time") { + if (-e "$runDir/cpglh.result") { die "doCpg: looks like this was run successfully already " . - "(run.time exists). Either run with -continue makeBed or some later " . + "(cpglh.result exists). Either run with -continue makeBed or some later " . "stage, or move aside/remove $runDir/ and run again.\n"; - } elsif ((-e "$runDir/gsub" || -e "$runDir/jobList") && ! $opt_debug) { - die "doCpg: looks like we are not starting with a clean " . - "slate.\n\tclean\n $runDir/\n\tand run again.\n"; } &HgAutomate::mustMkdir($runDir); - my $templateCmd = ("./runCpg.csh " . '$(root1) ' - . '{check out exists results/$(root1).cpg}'); - &HgAutomate::makeGsub($runDir, $templateCmd); - `touch "$runDir/para_hub_$paraHub"`; - - my $fh = &HgAutomate::mustOpen(">$runDir/runCpg.csh"); - print $fh <<_EOF_ -#!/bin/csh -ef -set partName = \$1 -set part2bit = hardMaskedFa/\$partName.2bit -set result = \$2 -twoBitToFa \$part2bit stdout | /hive/data/staging/data/cpgIslandExt/cpglh /dev/stdin > \$result -_EOF_ - ; - close($fh); - my $whatItDoes = "Run /hive/data/staging/data/cpgIslandExt/cpglh on masked sequence."; - my $bossScript = new HgRemoteScript("$runDir/doCpg.csh", $paraHub, + my $bossScript = newBash HgRemoteScript("$runDir/doCpg.bcsh", $workhorse, $runDir, $whatItDoes); - my $paraRun = &HgAutomate::paraRun(); - my $gensub2 = &HgAutomate::gensub2(); $bossScript->add(<<_EOF_ -mkdir -p results -chmod a+x runCpg.csh -rm -f file.list -find ./hardMaskedFa -type f > file.list -$gensub2 file.list single gsub jobList -$paraRun +export twoBit=\"$maskedSeq\" +twoBitToFa \$twoBit stdout | maskOutFa stdin hard stdout \\ + | /hive/data/staging/data/cpgIslandExt/cpglh /dev/stdin 2> cpglh.stderr \\ + > cpglh.result _EOF_ ); + $bossScript->execute(); } # doCpg ######################################################################### # * step: make bed [workhorse] sub doMakeBed { my $runDir = $buildDir; &HgAutomate::mustMkdir($runDir); # First, make sure we're starting clean. if (-e "$runDir/cpgIsland.bed") { die "doMakeBed: looks like this was run successfully already " . "(cpgIsland.bed exists). Either run with -continue load or cleanup " . "or move aside/remove $runDir/cpgIsland.bed and run again.\n"; } + if (! -e "$runDir/cpglh.result") { + die "doMakeBed: previous step doCpg has not completed, cpglh.result not found\n"; + } my $whatItDoes = "Makes bed from cpglh output."; - my $bossScript = new HgRemoteScript("$runDir/doMakeBed.csh", $workhorse, + my $bossScript = newBash HgRemoteScript("$runDir/doMakeBed.bcsh", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ -catDir -r results \\ - | awk \'\{\$2 = \$2 - 1; width = \$3 - \$2; printf\(\"\%s\\t\%d\\t\%s\\t\%s \%s\\t\%s\\t\%s\\t\%0.0f\\t\%0.1f\\t\%s\\t\%s\\n\", \$1, \$2, \$3, \$5, \$6, width, \$6, width\*\$7\*0.01, 100.0\*2\*\$6\/width, \$7, \$9\);}\' \\ +awk \'\{\$2 = \$2 - 1; width = \$3 - \$2; printf\(\"\%s\\t\%d\\t\%s\\t\%s \%s\\t\%s\\t\%s\\t\%0.0f\\t\%0.1f\\t\%s\\t\%s\\n\", \$1, \$2, \$3, \$5, \$6, width, \$6, width\*\$7\*0.01, 100.0\*2\*\$6\/width, \$7, \$9\);}\' cpglh.result \\ | sort -k1,1 -k2,2n > cpgIsland.bed bedToBigBed -tab -type=bed4+6 -as=\$HOME/kent/src/hg/lib/cpgIslandExt.as \\ cpgIsland.bed $chromSizes $db.$tableName.bb _EOF_ ); $bossScript->execute(); } # doMakeBed ######################################################################### # * step: load [dbHost] sub doLoadCpg { my $runDir = $buildDir; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "Loads cpgIsland.bed."; my $bossScript = new HgRemoteScript("$runDir/doLoadCpg.csh", $dbHost, $runDir, $whatItDoes); + if (! -e "$runDir/cpgIsland.bed") { + die "doLoadCpg previous step doMakeBed has not completed, cpgIsland.bed not found\n"; + } + $bossScript->add(<<_EOF_ set C=`cut -f1 cpgIsland.bed | sort -u | awk '{print length(\$0)}' | sort -rn | sed -n -e '1,1 p'` sed -e "s/14/\${C}/" \$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandExt.sql hgLoadBed -noLoad -sqlTable=cpgIslandExt.sql -tab $db $tableName cpgIsland.bed hgLoadSqlTab $db $tableName cpgIslandExt.sql bed.tab checkTableCoords -verboseBlocks -table=$tableName $db featureBits $db $tableName >&fb.$db.$tableName.txt _EOF_ ); $bossScript->execute(); } # doLoad ######################################################################### # * step: cleanup [fileServer] sub doCleanup { my $runDir = $buildDir; my $whatItDoes = "It cleans up or compresses intermediate files."; my $fileServer = &HgAutomate::chooseFileServer($runDir); my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ -rm -rf hardMaskedFa/ results/ err/ run.hardMask/err/ -rm -f batch.bak bed.tab cpgIslandExt.sql run.hardMask/batch.bak +rm -f bed.tab cpgIslandExt.sql gzip cpgIsland.bed _EOF_ ); $bossScript->execute(); } # doCleanup - ######################################################################### # main # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); # Make sure we have valid options and exactly 1 argument: &checkOptions(); &usage(1) if (scalar(@ARGV) != 1); $secondsStart = `date "+%s"`; chomp $secondsStart; ($db) = @ARGV; # Force debug and verbose until this is looking pretty solid: #$opt_debug = 1;