#!/usr/bin/env perl

# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/doRepeatModeler.pl instead.

# Initial version of running RepeatModeler (hiram, 2022-12-15).
#
# Drives a three step pipeline (blastDb, cluster, cleanup) that runs
# RepeatModeler on the unmasked sequence of one assembly and leaves the
# resulting consensus library in <buildDir>/<db>-families.fa.

use Getopt::Long;
use warnings;
use strict;
use Carp;
use FindBin qw($Bin);
use lib "$Bin";
use HgAutomate;
use HgRemoteScript;
use HgStepManager;

# Hardcoded command path:
my $RepeatModelerPath = "/hive/data/outside/RepeatModeler-2.0.4";
my $RepeatModeler = "$RepeatModelerPath/RepeatModeler";
my $BuildDatabase = "$RepeatModelerPath/BuildDatabase";
# configured to consume one entire ku machine node
my $threadCount = "-threads 32";
my $parasolOpts = "-cpu=32 -ram=128g";

# Option defaults (each may be overridden on the command line, see
# checkOptions below):
my $bigClusterHub = 'ku';
my $workhorse = 'hgwdev';
my $dbHost = 'hgwdev';
# Only used for display in the usage message; the real path is computed in
# the main section once $db is known.
my $unmaskedSeq = "\$db.unmasked.2bit";

# Option variable names, both common and peculiar to this script.
# 'use vars' is kept (rather than 'our') because the lists of names are
# supplied at compile time by the HgAutomate / HgStepManager modules.
use vars @HgAutomate::commonOptionVars;
use vars @HgStepManager::optionVars;
use vars qw/
    $opt_buildDir
    $opt_unmaskedSeq
    /;

# Specify the steps supported with -continue / -stop:
my $stepper = HgStepManager->new(
    [ { name => 'blastDb', func => \&doBlastDb },
      { name => 'cluster', func => \&doCluster },
      { name => 'cleanup', func => \&doCleanup },
    ]
				);

# Script name without leading directories, for the usage message.
my $base = $0;
$base =~ s/^(.*\/)?//;

# usage($status, $detailed)
#   Print the help text to STDERR and exit with $status.
#   $status   - process exit code.
#   $detailed - when true, also print the "Assumptions" section (-help).
# Never returns.
sub usage {
  # Usage / help / self-documentation:
  my ($status, $detailed) = @_;
  # Basic help (for incorrect usage):
  print STDERR "
usage: $base db
options:
    the db argument is a UCSC database name or the assembly identifier
    for a GenArk assembly hub build
";
  print STDERR $stepper->getOptionHelp();
  print STDERR <<_EOF_
    -buildDir dir         Use dir instead of default
                          $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatModeler.\$date
                          (necessary when continuing at a later date).
    -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead
                          of default ($unmaskedSeq).
_EOF_
  ;
  # NOTE(review): the empty defaults for workhorse / bigClusterHub are passed
  # through as in the original; this script actually defaults them to
  # '$workhorse' / '$bigClusterHub' -- confirm how getCommonOptionHelp
  # renders an empty default before changing this.
  print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
						'workhorse' => '',
						'bigClusterHub' => '');
  # The step list below must match the steps registered with $stepper.
  print STDERR "
Automates the RepeatModeler process for genome assembly \$db.  Steps:
    blastDb: construct fasta file from unmasked.2bit and rmblastn index files.
    cluster: Parasol cluster run of RepeatModeler, produces the consensus
             library file \$db-families.fa.
    cleanup: Removes or compresses intermediate files.
All operations are performed in the build directory which is
$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatModeler.\$date unless -buildDir is given.
Run -help to see what files are required for this script.
";
  # Detailed help (-help):
  print STDERR "
Assumptions:
1. $HgAutomate::clusterData/\$db/\$db.unmasked.2bit contains sequence for
   database/assembly \$db.  (This can be overridden with -unmaskedSeq.)
2. When complete, the resulting RepeatMasker library file will be in the build
   directory with the name: asmId-families.fa
" if ($detailed);
  print STDERR "\n";
  exit $status;
}

# Globals:
# Command line args: db
my ($db);
# Other:
my ($buildDir, $secondsStart, $secondsEnd);

# checkOptions()
#   Parse the command line into the $opt_* package variables, show usage on
#   a parse error or -help, and copy any host overrides into the file
#   level defaults ($dbHost, $workhorse, $bigClusterHub).
sub checkOptions {
  # Make sure command line options are valid/supported.
  my $ok = GetOptions(@HgStepManager::optionSpec,
		      'buildDir=s',
		      'unmaskedSeq=s',
		      @HgAutomate::commonOptionSpec,
		      );
  usage(1) if (!$ok);
  usage(0, 1) if ($opt_help);
  &HgAutomate::processCommonOptions();
  my $err = $stepper->processOptions();
  usage(1) if ($err);
  $dbHost = $opt_dbHost if ($opt_dbHost);
  $workhorse = $opt_workhorse if ($opt_workhorse);
  $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
}

#########################################################################
# * step: blastDb [workhorse]
# doBlastDb()
#   Write and (unless -debug) execute $buildDir/blastDb.bash, which converts
#   the unmasked 2bit to <db>.fa and runs RepeatModeler's BuildDatabase on
#   it.  Returns early when <db>.nsq already exists with non-zero size.
#   Dies when the unmasked 2bit file does not exist.
sub doBlastDb {
  my $runDir = "$buildDir";
  # A non-empty <db>.nsq index means this step was already done.
  if ( ! $opt_debug && -d "$runDir" && -s "$runDir/$db.nsq" ) {
    &HgAutomate::verbose(1, "\nblastDb step previously completed\n");
    return;
  }
  &HgAutomate::mustMkdir($runDir);

  if (! -e $unmaskedSeq) {
    die "Error: required file $unmaskedSeq does not exist.";
  }

  my $whatItDoes =
"Construct .fa file from unmasked.2bit, then run BuildDatabase from
RepeatModeler to prepare rmblastn index files.";

  my $bossScript = HgRemoteScript->newBash("$runDir/blastDb.bash", $workhorse,
				      $runDir, $whatItDoes);

  # In the script text below, \$ reaches bash as a literal '$' while
  # ${db} etc. are filled in by perl now.  The -nt tests make the bash
  # script skip work whose output is already newer than its input.
  $bossScript->add(<<_EOF_
export asmId="${db}"
export unmasked2Bit="${unmaskedSeq}"
export bDatabase="${BuildDatabase}"

if [ "\${unmasked2Bit}" -nt "\${asmId}.fa" ]; then
  twoBitToFa "\${unmasked2Bit}" "\${asmId}.fa"
  touch -r "\${unmasked2Bit}" "\${asmId}.fa"
fi

if [ "\${asmId}.fa" -nt "\${asmId}.nsq" ]; then
  time (\$bDatabase -name "\${asmId}" -engine ncbi "\${asmId}.fa") > blastDb.log 2>&1
fi
_EOF_
  );

  $bossScript->execute() if (! $opt_debug);
} # sub doBlastDb

#########################################################################
# * step: cluster [bigClusterHub]
# doCluster()
#   Write and (unless -debug) execute $buildDir/doCluster.bash, which creates
#   a one-job parasol batch running RepeatModeler against the database built
#   by the blastDb step, then records 'para time' in run.time.
#   Returns early when <db>-families.fa already exists; dies when the
#   blastDb output is missing or a previous run left run.time without a
#   library file.
sub doCluster {
  my $runDir = "$buildDir";
  my $paraHub = $bigClusterHub;

  if ( ! $opt_debug ) {
    # Check for completion first: the cleanup step removes the *.n?? index
    # files, so testing for <db>.nsq before this would make a re-run of a
    # finished, cleaned-up build die instead of reporting completion.
    if ( -s "$runDir/${db}-families.fa" ) {
      &HgAutomate::verbose(1, "\ncluster step previously completed\n");
      return;
    }
    # Make sure previous step has completed:
    if ( ! -s "$runDir/$db.nsq" ) {
      die "doCluster: previous 'blastDb' step has not completed, $db.nsq not present\n";
    }
    # run.time is written last by the bash script, so its presence without
    # a library file means an earlier attempt finished without a result.
    if ( -s "$runDir/run.time" ) {
      die "cluster: this step appears to have run before, but is broken, run.time is present but ${db}-families.fa is not present ?";
    }
  }

  my $whatItDoes =
"runs single cluster job to perform the RepeatModeler process.";

  my $bossScript = HgRemoteScript->newBash("$runDir/doCluster.bash", $paraHub,
				      $runDir, $whatItDoes);
  # The job's log file name must be the same file the parasol
  # '{check out line+ ...}' clause in jobList tests for, otherwise the
  # batch can never be seen as successful.
  $bossScript->add(<<_EOF_
printf '#!/bin/bash

set -beEu -o pipefail

export asmId="\${1}"
export threadCount="${threadCount}"
export rModeler="${RepeatModeler}"

time (\$rModeler -engine ncbi \$threadCount -database "\${asmId}") > "\${asmId}-rmod.log" 2>&1
' > oneJob
chmod +x oneJob
printf "oneJob ${db} {check out line+ ${db}-rmod.log}\n" > jobList
para make $parasolOpts jobList
para check
para time > run.time
cat run.time
_EOF_
  );
  $bossScript->execute() if (! $opt_debug);
} # doCluster

#########################################################################
# * step: cleanup [workhorse]
# doCleanup()
#   Write and (unless -debug) execute $buildDir/doCleanup.bash, which removes
#   <db>.fa and the <db>.n?? index files, gzips the Stockholm alignment
#   file and removes the single RM_* working directory.
#   Returns early when <db>.fa is already gone; dies when the cluster step
#   has not produced <db>-families.fa.
sub doCleanup {
  my $runDir = "$buildDir";

  # First, make sure previous step has completed:
  if ( ! $opt_debug ) {
    if ( ! -s "$runDir/${db}-families.fa" ) {
      if ( -s "$runDir/run.time" ) {
        die "cleanup: previous 'cluster' step appears to be broken, run.time is present but ${db}-families.fa is not present ?";
      }
      die "cleanup: previous 'cluster' step has not completed, ${db}-families.fa not present\n";
    }
    # And, verify this step has not run before
    if ( ! -s "$runDir/${db}.fa" ) {
      &HgAutomate::verbose(1, "\ncleanup step previously completed\n");
      return;
    }
  }
  my $whatItDoes = "Cleans up or compresses intermediate files.";
  my $bossScript = HgRemoteScript->newBash("$runDir/doCleanup.bash", $workhorse,
				      $runDir, $whatItDoes);
  # gzip is run without -c so the file is compressed in place ('gzip -c'
  # only streams to stdout).  RepeatModeler names its seed alignment file
  # <db>-families.stk; <db>.stk is still accepted in case it exists.
  $bossScript->add(<<_EOF_
export asmId="${db}"

rm -f \${asmId}.fa
rm -f \${asmId}.n??
for stk in "\${asmId}.stk" "\${asmId}-families.stk"; do
  if [ -s "\${stk}" ]; then
    gzip -f "\${stk}"
  fi
done
c=`ls -d RM_* | wc -l`
if [ "\${c}" -eq 1 ]; then
  RM_dir=`ls -d RM_*`
  if [ -d "\${RM_dir}" ]; then
    rm -fr "\${RM_dir}"
  else
    printf "directory RM_* not found ?\n" 1>&2
    ls -d RM* 1>&2
    exit 255
  fi
else
  printf "single directory RM_* not found ?\n" 1>&2
  ls -d RM* 1>&2
  exit 255
fi
_EOF_
  );
  $bossScript->execute() if (! $opt_debug);
} # doCleanup

#########################################################################
# main

# Prevent "Suspended (tty input)" hanging:
&HgAutomate::closeStdin();

# Make sure we have valid options and exactly 1 argument:
checkOptions();
usage(1) if (scalar(@ARGV) != 1);
$secondsStart = `date "+%s"`;
chomp $secondsStart;
($db) = @ARGV;

# Now that we know the $db, figure out our paths:
my $date = `date +%Y-%m-%d`;
chomp $date;
$buildDir = $opt_buildDir ? $opt_buildDir :
  "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/RepeatModeler.$date";
$unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq :
  "$HgAutomate::clusterData/$db/$db.unmasked.2bit";

# Do everything.
$stepper->execute();

# Tell the user anything they should know.
my $stopStep = $stepper->getStopStep();
my $upThrough = ($stopStep eq 'cleanup') ? "" :
  "  (through the '$stopStep' step)";

$secondsEnd = `date "+%s"`;
chomp $secondsEnd;
my $elapsedSeconds = $secondsEnd - $secondsStart;
my $elapsedMinutes = int($elapsedSeconds/60);
$elapsedSeconds -= $elapsedMinutes * 60;

&HgAutomate::verbose(1, <<_EOF_

 *** All done!$upThrough - Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s
 *** Steps were performed in $buildDir
_EOF_
);
# The library file is produced by the 'cluster' step, so report it when the
# run stopped at that step or at any later one.
if ($stopStep eq 'cluster' || $stepper->stepPrecedes('cluster', $stopStep)) {
  &HgAutomate::verbose(1, <<_EOF_
 *** Result library file should be present in\n$buildDir/${db}-families.fa
     to be used by doRepeatMasker.pl -customLib=${db}-families.fa
_EOF_
  );
}
&HgAutomate::verbose(1, "\n");