#!/usr/bin/env perl

# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/doRepeatModeler.pl instead.

# Provenance: reconstructed from the diff view of commit
# 3dbc2036473e40fda245db7df8fa725ef6e8d5de (hiram, Mon Jan 9 13:54:49 2023 -0800,
# "and adding in a families.fa.gz download file", no redmine).

use Getopt::Long;
use warnings;
use strict;
use Carp;
use FindBin qw($Bin);
use lib "$Bin";
use HgAutomate;
use HgRemoteScript;
use HgStepManager;

# Hardcoded command path:
my $RepeatModelerPath = "/hive/data/outside/RepeatModeler-2.0.4";
my $RepeatModeler = "$RepeatModelerPath/RepeatModeler";
my $BuildDatabase = "$RepeatModelerPath/BuildDatabase";
# configured to consume one entire ku machine node
my $threadCount = "-threads 32";
my $parasolOpts = "-cpu=32 -ram=128g";

# Option defaults
my $bigClusterHub = 'ku';
my $workhorse = "hgwdev";

# Option variable names, both common and peculiar to this script:
# (these 'use vars' lines declare the $opt_* package globals that
#  GetOptions fills in, so they must stay after 'use HgAutomate' etc.)
use vars @HgAutomate::commonOptionVars;
use vars @HgStepManager::optionVars;
use vars qw/
    $opt_buildDir
    $opt_unmaskedSeq
    /;

# Specify the steps supported with -continue / -stop:
my $stepper = HgStepManager->new(
    [ { name => 'blastDb', func => \&doBlastDb },
      { name => 'cluster', func => \&doCluster },
      { name => 'cleanup', func => \&doCleanup },
    ]
				);

# Option defaults:
my $dbHost = 'hgwdev';
# Literal text '$db.unmasked.2bit' for the help message; the real path is
# computed in the main section once the db argument is known.
my $unmaskedSeq = "\$db.unmasked.2bit";

# Script name without any leading directory, for the usage message.
my $base = $0;
$base =~ s/^(.*\/)?//;

sub usage {
  # Usage / help / self-documentation:
  #   $status   - exit code handed to exit()
  #   $detailed - when true, also print the "Assumptions" section
  # Everything is printed to STDERR; this sub does not return.
  my ($status, $detailed) = @_;
  # Basic help (for incorrect usage):
  print STDERR "
usage: $base db
options:
    the db argument is a UCSC database name or the assembly identifier
    for a GenArk assembly hub build
";
  print STDERR $stepper->getOptionHelp();
  print STDERR <<_EOF_
    -buildDir dir         Use dir instead of default
                          $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/repeatModeler.\$date
                          (necessary when continuing at a later date).
    -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead
                          of default ($unmaskedSeq).
_EOF_
  ;
  print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
						'workhorse' => '',
						'bigClusterHub' => '');
  print STDERR "
Automates the RepeatModeler process for genome assembly \$db.  Steps:
    blastDb: construct fasta file from unmasked.2bit and rmblastn index files.
    cluster: Parasol cluster run of RepeatModeler.
    cleanup: Removes or compresses intermediate files.
All operations are performed in the build directory which is
$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/repeatModeler.\$date unless -buildDir is given.
Run -help to see what files are required for this script.
";
  # Detailed help (-help):
  print STDERR "
Assumptions:
1. $HgAutomate::clusterData/\$db/\$db.unmasked.2bit contains sequence for
   database/assembly \$db.  (This can be overridden with -unmaskedSeq.)
2. When complete, the resulting RepeatMasker library file will be in the build
   directory with the name: asmId-families.fa
" if ($detailed);
  print STDERR "\n";
  exit $status;
}


# Globals:
# Command line args: db
my ($db);
# Other:
my ($buildDir, $secondsStart, $secondsEnd);

sub checkOptions {
  # Make sure command line options are valid/supported.
  # Exits through usage() on a parse error, on -help, or when the step
  # manager rejects its options; otherwise copies -dbHost, -workhorse and
  # -bigClusterHub over the script defaults when they were given.
  my $ok = GetOptions(@HgStepManager::optionSpec,
		      'buildDir=s',
		      'unmaskedSeq=s',
		      @HgAutomate::commonOptionSpec,
		      );
  usage(1) if (!$ok);
  usage(0, 1) if ($opt_help);
  &HgAutomate::processCommonOptions();
  my $err = $stepper->processOptions();
  usage(1) if ($err);
  $dbHost = $opt_dbHost if ($opt_dbHost);
  $workhorse = $opt_workhorse if ($opt_workhorse);
  $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
}

#########################################################################
# * step: blastDb [workhorse]
sub doBlastDb {
  # Writes blastDb.bash into the build directory (for host $workhorse) and,
  # unless -debug, executes it.  The generated script converts the unmasked
  # 2bit to $db.fa when the 2bit is newer, then runs BuildDatabase when
  # $db.fa is newer than $db.nsq.
  my $runDir = "$buildDir";

  # Nothing to do when a non-empty $db.nsq index is already present
  # (check is skipped under -debug).
  if ( ! $opt_debug ) {
    if ( -d "$runDir" ) {
      if ( -s "$runDir/$db.nsq" ) {
        &HgAutomate::verbose(1, "\nblastDb step previously completed\n");
        return;
      }
    }
  }

  &HgAutomate::mustMkdir($runDir);

  if (! -e $unmaskedSeq) {
    die "Error: required file $unmaskedSeq does not exist.";
  }

  my $whatItDoes = "Construct .fa file from unmasked.2bit, then run BuildDatabase from RepeatModeler to prepare rmblastn index files.";
  my $bossScript = HgRemoteScript->newBash("$runDir/blastDb.bash", $workhorse,
				      $runDir, $whatItDoes);

  # \$ and \${...} are left for bash; unescaped ${...} is filled in by Perl.
  $bossScript->add(<<_EOF_
export asmId="${db}"
export unmasked2Bit="${unmaskedSeq}"
export bDatabase="${BuildDatabase}"

if [ "\${unmasked2Bit}" -nt "\${asmId}.fa" ]; then
  twoBitToFa "\${unmasked2Bit}" "\${asmId}.fa"
  touch -r "\${unmasked2Bit}" "\${asmId}.fa"
fi

if [ "\${asmId}.fa" -nt "\${asmId}.nsq" ]; then
  time (\$bDatabase -name "\${asmId}" -engine ncbi "\${asmId}.fa") > blastDb.log 2>&1
fi
_EOF_
  );
  $bossScript->execute() if (! $opt_debug);
} # sub doBlastDb

#########################################################################
# * step: cluster [bigClusterHub]
sub doCluster {
  # Writes doCluster.bash into the build directory (for host
  # $bigClusterHub) and, unless -debug, executes it.  The generated script
  # creates a one-line parasol jobList whose job (oneJob) copies the build
  # directory to a /dev/shm temp directory, runs RepeatModeler there, and
  # copies the results back.
  my $runDir = "$buildDir";
  my $paraHub = $bigClusterHub;

  # First, make sure previous step has completed:
  if ( ! $opt_debug ) {
    if ( ! -s "$runDir/$db.nsq" ) {
      die "doCluster: previous 'blastDb' step has not completed, $db.nsq not present\n";
    }
    # And, verify this step has not run before
    if ( -s "$runDir/run.time" && ! -s "$runDir/${db}-families.fa" ) {
      die "cluster: this step appears to have run before, but is broken, run.time is present but ${db}-families.fa is not present ?";
    }
    if ( -s "$runDir/${db}-families.fa" ) {
      &HgAutomate::verbose(1, "\ncluster step previously completed\n");
      return;
    }
  }

  my $whatItDoes = "runs single cluster job to perform the RepeatModeler process.";
  my $bossScript = HgRemoteScript->newBash("$runDir/doCluster.bash", $paraHub,
				      $runDir, $whatItDoes);

  # The single-quoted printf argument becomes the body of oneJob.  The
  # 'rModeler' shell variable exists only inside oneJob, so the final
  # -version command uses the Perl-interpolated RepeatModeler path;
  # a bare \$rModeler would be unset in this boss script.
  $bossScript->add(<<_EOF_
printf '#!/bin/bash

set -beEu -o pipefail

export tmpDir=`mktemp -d -p /dev/shm rModeler.XXXXXX`

# working directory
cd "\${tmpDir}"
rsync --exclude "do.log" -a -P "${runDir}/" "\${tmpDir}/"

export asmId="\${1}"
export threadCount="${threadCount}"
export rModeler="${RepeatModeler}"

time (\$rModeler -engine ncbi \$threadCount -database "\${asmId}") > modeler.log 2>&1
rsync --exclude "do.log" -a -P ./ "${runDir}/"
cd "${runDir}"
rm -fr "\${tmpDir}/"
chmod 775 "${runDir}"
' > oneJob
chmod +x oneJob
printf "oneJob ${db} {check out line+ ${db}-rmod.log}\n" > jobList

para make $parasolOpts jobList
para check
para time > run.time
cat run.time

"${RepeatModeler}" -version > "${runDir}/modelerVersion.txt"
_EOF_
  );
  $bossScript->execute() if (! $opt_debug);
} # doCluster

#########################################################################
# * step: cleanup [workhorse]
sub doCleanup {
  # Writes modelerCleanup.bash into the build directory (for host
  # $workhorse) and, unless -debug, executes it.  The generated script
  # removes $db.fa, the $db.n?? index files and ./err/, gzips the .stk
  # file when present, writes $db.repeatModeler.families.fa.gz next to the
  # uncompressed library, and removes the single RM_* working directory.
  my $runDir = "$buildDir";

  # First, make sure previous step has completed:
  if ( ! $opt_debug ) {
    if ( -s "$runDir/run.time" && ! -s "$runDir/${db}-families.fa" ) {
      die "cleanup: previous 'cluster' step appears to be broken, run.time is present but ${db}-families.fa is not present ?";
    }
    if ( ! -s "$runDir/${db}-families.fa" ) {
      die "cleanup: previous 'cluster' step has not completed, ${db}-families.fa not present\n";
    }
    # And, verify this step has not run before: the generated script
    # removes $db.fa, so its absence is taken as "already cleaned up".
    if ( ! -s "$runDir/${db}.fa" ) {
      &HgAutomate::verbose(1, "\ncleanup step previously completed\n");
      return;
    }
  }

  my $whatItDoes = "Cleans up or compresses intermediate files.";
  my $bossScript = HgRemoteScript->newBash("$runDir/modelerCleanup.bash", $workhorse,
				      $runDir, $whatItDoes);
  # The .fa.gz copy gets the same timestamp as the uncompressed library
  # (touch -r).
  $bossScript->add(<<_EOF_
export asmId="${db}"

if [ ! -s "\${asmId}-families.fa" ]; then
  printf "cleanup expected result file: \${asmId}-families.fa does not exist\n" 1>&2
  exit 255
fi
rm -fr \${asmId}.fa \${asmId}.n?? ./err/
if [ -s "\${asmId}-families.stk" ]; then
  gzip \${asmId}-families.stk
fi
gzip -c "\${asmId}-families.fa" > "${buildDir}/\${asmId}.repeatModeler.families.fa.gz"
touch -r "\${asmId}-families.fa" "${buildDir}/\${asmId}.repeatModeler.families.fa.gz"
c=`ls -d RM_* | wc -l`
if [ "\${c}" -eq 1 ]; then
  RM_dir=`ls -d RM_*`
  if [ -d "\${RM_dir}" ]; then
    rm -fr "\${RM_dir}"
  else
    printf "directory RM_* not found ?\\n" 1>&2
    ls -d RM* 1>&2
    exit 255
  fi
else
  printf "single directory RM_* not found ?\\n" 1>&2
  ls -d RM* 1>&2
  exit 255
fi
_EOF_
  );
  $bossScript->execute() if (! $opt_debug);
} # doCleanup

#########################################################################
# main

# Prevent "Suspended (tty input)" hanging:
&HgAutomate::closeStdin();

# Make sure we have valid options and exactly 1 argument:
checkOptions();
usage(1) if (scalar(@ARGV) != 1);
# Epoch seconds, used only for the elapsed-time report at the end.
$secondsStart = time();
($db) = @ARGV;

# Now that we know the $db, figure out our paths:
my $date = `date +%Y-%m-%d`;
chomp $date;
$buildDir = $opt_buildDir ? $opt_buildDir :
  "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/repeatModeler.$date";
$unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq :
  "$HgAutomate::clusterData/$db/$db.unmasked.2bit";

# Do everything.
$stepper->execute();

# Tell the user anything they should know.
my $stopStep = $stepper->getStopStep();
my $upThrough = ($stopStep eq 'cleanup') ? "" :
  "  (through the '$stopStep' step)";

$secondsEnd = time();
my $elapsedSeconds = $secondsEnd - $secondsStart;
my $elapsedMinutes = int($elapsedSeconds/60);
$elapsedSeconds -= $elapsedMinutes * 60;

&HgAutomate::verbose(1, <<_EOF_

 *** All done!$upThrough - Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s
 *** Steps were performed in $buildDir
_EOF_
);
if ($stepper->stepPrecedes('cluster', $stopStep)) {
  &HgAutomate::verbose(1, <<_EOF_
 *** Result library file should be present in\n$buildDir/${db}-families.fa
     to be used by doRepeatMasker.pl -customLib=${db}-families.fa
_EOF_
  );
}
&HgAutomate::verbose(1, "\n");