8d7fca788984ae4f0dde81e775574cc24bd58bbf hiram Mon Feb 3 14:00:13 2025 -0800 updated to new window masker binary and now bash scripts in place of the C-shell scripts no redmine diff --git src/hg/utils/automation/doWindowMasker.pl src/hg/utils/automation/doWindowMasker.pl index b946726b85c..d3d1c32646c 100755 --- src/hg/utils/automation/doWindowMasker.pl +++ src/hg/utils/automation/doWindowMasker.pl @@ -1,298 +1,305 @@ #!/usr/bin/env perl # DO NOT EDIT the /cluster/bin/scripts copy of this file -- # edit ~/kent/src/hg/utils/automation/doWindowMasker.pl instead. # $Id: doWindowMasker.pl,v 1.8 2009/03/13 22:27:12 hiram Exp $ use Getopt::Long; use warnings; use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_unmaskedSeq /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'count', func => \&doCount }, - { name => 'mask', func => \&doMask }, { name => 'sdust', func => \&doSdust }, { name => 'twobit', func => \&doTwoBit }, { name => 'load', func => \&doLoad }, { name => 'cleanup', func => \&doCleanup }, ] ); +### new version of windowmasker 2025-01-29 +my $wmDir = "/hive/data/outside/ncbiToolKit/v28.0.11/build.v28.0.11/GCC1150-DebugMT64/bin"; + # Option defaults: my $defaultWorkhorse = 'least loaded'; my $dbHost = 'hgwdev'; my $unmaskedSeq = "$HgAutomate::clusterData/\$db/\$db.unmasked.2bit"; my $base = $0; $base =~ s/^(.*\/)?//; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base db options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Use dir instead of default $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/WindowMasker.\$date (necessary when continuing at a later date). -unmaskedSeq <file.2bit> Use file.2bit for unmasked sequence, default is: $unmaskedSeq _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $defaultWorkhorse); print STDERR " Automates UCSC's WindowMasker process for genome database \$db. Steps: count: Do the first pass of WindowMasker: collecting the counts. mask: The second pass of WindowMasker and collect output. sdust: Another pass of WindowMasker using -sdust true. twobit: Make masked twobit files. load: load and clean of gaps, reload cleaned table. cleanup: Removes or compresses intermediate files. All operations are performed in the build directory which is $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/WindowMasker.\$date unless -buildDir is given. "; # Detailed help (-help): print STDERR " Assumptions: 1. $HgAutomate::clusterData/\$db/\$db.unmasked.2bit contains sequence for database/assembly \$db. (This can be overridden with -unmaskedSeq.) " if ($detailed); print "\n"; exit $status; } # Globals: # Command line args: db my ($db); # Other: my ($buildDir); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'unmaskedSeq=s', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### # * step: count [workhorse] sub doCount { my $runDir = "$buildDir"; &HgAutomate::checkCleanSlate('count', 'mask', "$runDir/windowmasker.counts"); &HgAutomate::mustMkdir($runDir); my $whatItDoes = "It does WindowMasker counts step."; my $workhorse = &HgAutomate::chooseWorkhorse(); - my $bossScript = new HgRemoteScript("$runDir/doCount.csh", $workhorse, + my $bossScript = newBash HgRemoteScript("$runDir/doCount.bash", $workhorse, $runDir, $whatItDoes); my $tmpDir = &HgAutomate::tmpDir(); $bossScript->add(<<_EOF_ -if ( -d "/data/tmp" ) then - setenv TMPDIR "/data/tmp" -else if ( -d "/scratch/tmp" ) then - setenv TMPDIR "/scratch/tmp" +unset TMPDIR +if [ -d "/data/tmp" ]; then + export TMPDIR="/data/tmp" +elif [ -d "/dev/shm" ]; then + export TMPDIR="/dev/shm" +elif [ -d "/scratch/tmp" ]; then + export TMPDIR="/scratch/tmp" else - setenv TMPDIR "/tmp" -endif -set windowMaskerDir = /cluster/bin/\$MACHTYPE -set windowMasker = \$windowMaskerDir/windowmasker -set fa = $db.fa -set tmpDir = `mktemp -d -p \$TMPDIR doWindowMasker.XXXXXX` -chmod 775 \$tmpDir -set inputTwoBit = $unmaskedSeq -pushd \$tmpDir + export TMPDIR="/tmp" +fi + +export windowMaskerDir="$wmDir" +export windowMasker="\$windowMaskerDir/windowmasker" + +export fa="$db.fa" +export tmpDir=`mktemp -d -p \$TMPDIR doWindowMasker.XXXXXX` +chmod 777 \$tmpDir +export inputTwoBit="$unmaskedSeq" +cd \$tmpDir twoBitToFa \$inputTwoBit \$fa -\$windowMasker -mk_counts true -input \$fa -output windowmasker.counts -popd +\$windowMasker -mem 32768 -mk_counts -in \$fa -out windowmasker.counts +cd $runDir cp \$tmpDir/windowmasker.counts . rm -rf \$tmpDir _EOF_ ); $bossScript->execute(); } # doCount - -######################################################################### -# * step: mask [workhorse] -sub doMask { - printf STDERR "# doMask: obsolete step, no longer needed\n"; - return 0; -} # doMask - ######################################################################### # * step: sdust [workhorse] sub doSdust { my $runDir = "$buildDir"; &HgAutomate::checkExistsUnlessDebug('mask', 'sdust', "$runDir/windowmasker.counts"); my $whatItDoes = "It does WindowMasker masking step with -sdust true."; my $workhorse = &HgAutomate::chooseWorkhorse(); - my $bossScript = new HgRemoteScript("$runDir/doSdust.csh", $workhorse, + my $bossScript = newBash HgRemoteScript("$runDir/doSdust.bash", $workhorse, $runDir, $whatItDoes); my $tmpDir = &HgAutomate::tmpDir(); $bossScript->add(<<_EOF_ -if ( -d "/data/tmp" ) then - setenv TMPDIR "/data/tmp" -else if ( -d "/scratch/tmp" ) then - setenv TMPDIR "/scratch/tmp" +unset TMPDIR +if [ -d "/data/tmp" ]; then + export TMPDIR="/data/tmp" +elif [ -d "/dev/shm" ]; then + export TMPDIR="/dev/shm" +elif [ -d "/scratch/tmp" ]; then + export TMPDIR="/scratch/tmp" else - setenv TMPDIR "/tmp" -endif -set windowMaskerDir = /cluster/bin/\$MACHTYPE -set windowMasker = \$windowMaskerDir/windowmasker -set fa = $db.fa -set tmpDir = `mktemp -d -p \$TMPDIR doWindowMasker.XXXXXX` -chmod 775 \$tmpDir -set inputTwoBit = $unmaskedSeq + export TMPDIR="/tmp" +fi +export windowMaskerDir="$wmDir" +export windowMasker="\$windowMaskerDir/windowmasker" +export fa="$db.fa" +export tmpDir=`mktemp -d -p \$TMPDIR doWindowMasker.XXXXXX` +chmod 777 \$tmpDir +export inputTwoBit="$unmaskedSeq" cp windowmasker.counts \$tmpDir -pushd \$tmpDir +cd \$tmpDir twoBitToFa \$inputTwoBit \$fa -\$windowMasker -ustat windowmasker.counts -sdust true -input \$fa -output windowmasker.intervals -perl -wpe \'if \(s\/^\>lcl\\\|\(\.\*\)\\n\$\/\/\) { \$chr = \$1\; } \\ - if \(\/^\(\\d+\) \- \(\\d+\)\/\) { \\ - \$s=\$1\; \$e=\$2+1\; s\/\(\\d+\) \- \(\\d+\)\/\$chr\\t\$s\\t\$e\/\; \\ - }\' windowmasker.intervals > windowmasker.sdust.bed -popd +\$windowMasker -ustat windowmasker.counts -dust T -in \$fa \\ + | awk \' +/^>/ { sub(/^>/, "", \$0); chr = \$0; next; } + +/^[0-9]+ - [0-9]+/ { split(\$0, arr, " - "); start = arr[1]; end = arr[2] + 1; + printf "%s\\t%d\\t%d\\n", chr, start, end; +} +\' > windowmasker.sdust.bed +cd $runDir cp \$tmpDir/windowmasker.sdust.bed . rm -rf \$tmpDir _EOF_ ); $bossScript->execute(); } # doSdust ######################################################################### # * step: twobit [fileServer] sub doTwoBit { my $runDir = "$buildDir"; my $whatItDoes = "Make .2bit files from the beds."; &HgAutomate::checkExistsUnlessDebug('sdust', 'twobit', ("$runDir/windowmasker.counts", "$runDir/windowmasker.sdust.bed")); my $fileServer = &HgAutomate::chooseFileServer($runDir); - my $bossScript = new HgRemoteScript("$runDir/doTwoBit.csh", $fileServer, + my $bossScript = newBash HgRemoteScript("$runDir/doTwoBit.bash", $fileServer, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ twoBitMask $unmaskedSeq windowmasker.sdust.bed $db.wmsk.sdust.2bit -twoBitToFa $db.wmsk.sdust.2bit stdout | faSize stdin >&faSize.$db.wmsk.sdust.txt +twoBitToFa $db.wmsk.sdust.2bit stdout | faSize stdin > faSize.$db.wmsk.sdust.txt 2>&1 _EOF_ ); $bossScript->execute(); } #doTwoBit ######################################################################### # * step: load [dbHost] sub doLoad { my $runDir = "$buildDir"; my $whatItDoes = "load sdust.bed and filter with gaps to clean"; &HgAutomate::checkExistsUnlessDebug('twobit', 'load', ("$runDir/$db.wmsk.sdust.2bit", "$runDir/faSize.$db.wmsk.sdust.txt")); - my $bossScript = new HgRemoteScript("$runDir/doLoad.csh", $dbHost, + my $bossScript = newBash HgRemoteScript("$runDir/doLoad.bash", $dbHost, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ -hgLoadBed $db windowmaskerSdust windowmasker.sdust.bed -featureBits -countGaps $db windowmaskerSdust >&fb.$db.windowmaskerSdust.beforeClean.txt -featureBits $db -not gap -bed=notGap.bed -featureBits $db windowmaskerSdust notGap.bed -bed=stdout | gzip -c > cleanWMask.bed.gz -hgLoadBed $db windowmaskerSdust cleanWMask.bed.gz -featureBits -countGaps $db windowmaskerSdust >&fb.$db.windowmaskerSdust.clean.txt -zcat cleanWMask.bed.gz | twoBitMask ../../$db.unmasked.2bit stdin -type=.bed $db.cleanWMSdust.2bit -twoBitToFa $db.cleanWMSdust.2bit stdout | faSize stdin >& faSize.$db.cleanWMSdust.txt -featureBits -countGaps $db rmsk windowmaskerSdust >&fb.$db.rmsk.windowmaskerSdust.txt +export db="$db" +hgLoadBed \$db windowmaskerSdust windowmasker.sdust.bed +featureBits -countGaps \$db windowmaskerSdust > fb.\$db.windowmaskerSdust.beforeClean.txt 2>&1 +featureBits \$db -not gap -bed=notGap.bed +featureBits \$db windowmaskerSdust notGap.bed -bed=stdout | gzip -c > cleanWMask.bed.gz +hgLoadBed \$db windowmaskerSdust cleanWMask.bed.gz +featureBits -countGaps \$db windowmaskerSdust > fb.\$db.windowmaskerSdust.clean.txt 2>&1 +zcat cleanWMask.bed.gz | twoBitMask ../../\$db.unmasked.2bit stdin -type=.bed \$db.cleanWMSdust.2bit +twoBitToFa \$db.cleanWMSdust.2bit stdout | faSize stdin >& faSize.\$db.cleanWMSdust.txt +featureBits -countGaps \$db rmsk windowmaskerSdust > fb.\$db.rmsk.windowmaskerSdust.txt 2>&1 _EOF_ ); $bossScript->execute(); } #doLoad ######################################################################### # * step: cleanup [fileServer] sub doCleanup { my $runDir = "$buildDir"; my $whatItDoes = "It cleans up or compresses intermediate files."; my $fileServer = &HgAutomate::chooseFileServer($runDir); - my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer, + my $bossScript = newBash HgRemoteScript("$runDir/doCleanup.bash", $fileServer, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ -gzip $runDir/windowmasker.counts +gzip $runDir/windowmasker.counts & gzip $runDir/windowmasker.sdust.bed +wait _EOF_ ); $bossScript->execute(); } # doCleanup ######################################################################### # main # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); # Make sure we have valid options and exactly 1 argument: &checkOptions(); &usage(1) if (scalar(@ARGV) != 1); my $secondsStart = `date "+%s"`; chomp $secondsStart; ($db) = @ARGV; # Force debug and verbose until this is looking pretty solid: #$opt_debug = 1; $opt_verbose = 3 if ($opt_verbose < 3); # Establish what directory we will work in. my $date = `date +%Y-%m-%d`; chomp $date; $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/WindowMasker.$date"; $unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq : "$HgAutomate::clusterData/$db/$db.unmasked.2bit"; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; my $secondsEnd = `date "+%s"`; chomp $secondsEnd; my $elapsedSeconds = $secondsEnd - $secondsStart; my $elapsedMinutes = int($elapsedSeconds/60); $elapsedSeconds -= $elapsedMinutes * 60; &HgAutomate::verbose(1, "\n *** All done !$upThrough - Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n"); &HgAutomate::verbose(1, " *** Steps were performed in $buildDir\n"); &HgAutomate::verbose(1, "\n");