3dbc2036473e40fda245db7df8fa725ef6e8d5de
hiram
  Mon Jan 9 13:54:49 2023 -0800
and adding in a families.fa.gz download file no redmine

diff --git src/hg/utils/automation/doRepeatModeler.pl src/hg/utils/automation/doRepeatModeler.pl
index dd17461..a791e8d 100755
--- src/hg/utils/automation/doRepeatModeler.pl
+++ src/hg/utils/automation/doRepeatModeler.pl
@@ -1,324 +1,326 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/doRepeatModeler.pl instead.
 
 use Getopt::Long;
 use warnings;
 use strict;
 use Carp;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Hardcoded command path:
 my $RepeatModelerPath = "/hive/data/outside/RepeatModeler-2.0.4";
 my $RepeatModeler = "$RepeatModelerPath/RepeatModeler";
 my $BuildDatabase = "$RepeatModelerPath/BuildDatabase";
 # configured to consume one entire ku machine node
 my $threadCount = "-threads 32";
 my $parasolOpts = "-cpu=32 -ram=128g";
 # Option defaults
 my $bigClusterHub = 'ku';
 my $workhorse = "hgwdev";
 my $defaultWorkhorse = 'hgwdev';
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_unmaskedSeq
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'blastDb', func => \&doBlastDb },
       { name => 'cluster',     func => \&doCluster },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $unmaskedSeq = "\$db.unmasked.2bit";
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base db
 options:
     the db argument is a UCSC database name or the assembly identifier
     for a GenArk assembly hub build
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/repeatModeler.\$date
                           (necessary when continuing at a later date).
     -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead
                           of default ($unmaskedSeq).
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => '',
 						'bigClusterHub' => '');
   print STDERR "
 Automates the RepeatModeler process for genome assembly \$db.  Steps:
     blastDb: construct fasta file from unmasked.2bit and rmblastn index files.
     cluster: Parasol cluster run of RepeatModeler.
     cleanup: Removes or compresses intermediate files.
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/repeatModeler.\$date unless -buildDir is given.
 Run -help to see what files are required for this script.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.unmasked.2bit contains sequence for
    database/assembly \$db.  (This can be overridden with -unmaskedSeq.)
 2. When complete, the resulting RepeatMasker library file will be in the build
    directory with the name: asmId-families.fa
 " if ($detailed);
   print STDERR "\n";
   exit $status;
 }
 
 # Globals:
 # Command line args: db
 my ($db);
 # Other:
 my ($buildDir, $chromBased, $updateTable, $secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'unmaskedSeq=s',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
   $workhorse = $opt_workhorse if ($opt_workhorse);
   $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
 }
 
 #########################################################################
 # * step: cluster [workhorse]
 sub doBlastDb {
   my $runDir = "$buildDir";
   # verify starting with a clean directory, not done before
   if ( ! $opt_debug ) {
     if ( -d "$runDir" ) {
        if ( -s "$runDir/$db.nsq" ) {
          &HgAutomate::verbose(1, "\nblastDb step previously completed\n");
          return;
        }
     }
   }
   &HgAutomate::mustMkdir($runDir);
 
   if (! -e $unmaskedSeq) {
     die "Error: required file $unmaskedSeq does not exist.";
   }
 
   my $whatItDoes =
 "Construct .fa file from unmasked.2bit, then run BuildDatabase from
 RepeatModeler to prepare rmblastn index files.";
 
   my $bossScript = newBash HgRemoteScript("$runDir/blastDb.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId="${db}"
 export unmasked2Bit="${unmaskedSeq}"
 export bDatabase="${BuildDatabase}"
 
 if [ "\${unmasked2Bit}" -nt "\${asmId}.fa" ]; then
   twoBitToFa "\${unmasked2Bit}" "\${asmId}.fa"
   touch -r "\${unmasked2Bit}" "\${asmId}.fa"
 fi
 
 if [ "\${asmId}.fa" -nt "\${asmId}.nsq" ]; then
   time (\$bDatabase -name "\${asmId}" -engine ncbi "\${asmId}.fa") > blastDb.log 2>&1
 fi
 
 _EOF_
     );
 
   $bossScript->execute() if (! $opt_debug);
 } # sub doBlastDb
 
 #########################################################################
 # * step: cluster [bigClusterHub]
 sub doCluster {
   my $runDir = "$buildDir";
   my $paraHub = $bigClusterHub;
 
   # First, make sure previous step has completed:
   if ( ! $opt_debug ) {
     if ( ! -s "$runDir/$db.nsq" ) {
       die "doCluster: previous 'blastDb' step has not completed, $db.nsq not present\n";
     }
     # And, verify this step has not run before
     if ( -s "$runDir/run.time" && ! -s "$runDir/${db}-families.fa" ) {
       die "cluster: this step appears to have run before, but is broken, run.time is present but ${db}-families.fa is not present ?";
     }
     if ( -s "$runDir/${db}-families.fa" ) {
          &HgAutomate::verbose(1, "\ncluster step previously completed\n");
          return;
     }
   }
 
   my $whatItDoes =
 "runs single cluster job to perform the RepeatModeler process.";
 
   my $bossScript = newBash HgRemoteScript("$runDir/doCluster.bash", $paraHub,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 printf '#!/bin/bash
 
 set -beEu -o pipefail
 
 export tmpDir=`mktemp -d -p /dev/shm rModeler.XXXXXX`
 
 # working directory
 cd "\${tmpDir}"
 rsync --exclude "do.log" -a -P "${runDir}/" "\${tmpDir}/"
 
 export asmId="\${1}"
 export threadCount="${threadCount}"
 export rModeler="${RepeatModeler}"
 
 time (\$rModeler -engine ncbi \$threadCount -database "\${asmId}") > modeler.log 2>&1
 rsync --exclude "do.log" -a -P ./ "${runDir}/"
 cd "${runDir}"
 rm -fr "\${tmpDir}/"
 chmod 775 "${runDir}"
 ' > oneJob
 chmod +x oneJob
 printf "oneJob ${db} {check out line+ ${db}-rmod.log}\n" > jobList
 para make $parasolOpts jobList
 para check
 para time > run.time
 cat run.time
 \$rModeler -version > "${runDir}/modelerVersion.txt"
 
 _EOF_
   );
   $bossScript->execute() if (! $opt_debug);
 } # doCluster
 
 #########################################################################
 # * step: cleanup [workhorse]
 sub doCleanup {
   my $runDir = "$buildDir";
 
   # First, make sure previous step has completed:
   if ( ! $opt_debug ) {
     if ( -s "$runDir/run.time" && ! -s "$runDir/${db}-families.fa" ) {
       die "cleanup: previous 'cluster' step appears to be broken, run.time is present but ${db}-families.fa is not present ?";
     }
     if ( ! -s "$runDir/${db}-families.fa" ) {
       die "cleanup previous 'libResult' step has not completed, ${db}-families.fa not present\n";
     }
     # And, verify this step has not run before
     if ( ! -s "$runDir/${db}.fa" ) {
          &HgAutomate::verbose(1, "\ncleanup step previously completed\n");
          return;
     }
   }
   my $whatItDoes = "Cleans up or compresses intermediate files.";
   my $bossScript = newBash HgRemoteScript("$runDir/modelerCleanup.bash", $workhorse,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 export asmId="${db}"
 
 if [ ! -s "\${asmId}-families.fa" ]; then
   printf "cleanup expected result file: \${asmId}-families.fa does not exist\n" 1>&2
   exit 255
 fi
 rm -fr \${asmId}.fa \${asmId}.n?? ./err/
 if [ -s "\${asmId}-families.stk" ]; then
   gzip \${asmId}-families.stk
 fi
+gzip -c "\${asmId}-families.fa" > "${buildDir}/\${asmId}.repeatModeler.families.fa.gz"
+touch -r "\${asmId}-families.fa" "${buildDir}/\${asmId}.repeatModeler.families.fa.gz"
 c=`ls -d RM_* | wc -l`
 if [ "\${c}" -eq 1 ]; then
    RM_dir=`ls -d RM_*`
    if [ -d "\${RM_dir}" ]; then
      rm -fr "\${RM_dir}"
    else
      printf "directory RM_* not found ?\\n" 1>&2
      ls -d RM* 1>&2
      exit 255
    fi
 else
    printf "single directory RM_* not found ?\\n" 1>&2
    ls -d RM* 1>&2
    exit 255
 fi
 _EOF_
   );
   $bossScript->execute() if (! $opt_debug);
 } # doCleanup
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 ($db) = @ARGV;
 
 # Now that we know the $db, figure out our paths:
 my $date = `date +%Y-%m-%d`;
 chomp $date;
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/repeatModeler.$date";
 $unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq :
   "$HgAutomate::clusterData/$db/$db.unmasked.2bit";
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;
 
 &HgAutomate::verbose(1, <<_EOF_
 
  *** All done!$upThrough - Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s
  *** Steps were performed in $buildDir
 _EOF_
 );
 if ($stepper->stepPrecedes('cluster', $stopStep)) {
   &HgAutomate::verbose(1, <<_EOF_
  *** Result library file should be present in\n$buildDir/${db}-families.fa
      to be used by doRepeatMasker.pl -customLib=${db}-families.fa
 _EOF_
   );
 }
 &HgAutomate::verbose(1, "\n");