29bf2c22b06f9d457bcd43ef2029bad054542ba8
hiram
  Thu Dec 15 11:35:24 2022 -0800
initial version of running RepeatModeler no redmine

diff --git src/hg/utils/automation/doRepeatModeler.pl src/hg/utils/automation/doRepeatModeler.pl
new file mode 100755
index 0000000..04a8658
--- /dev/null
+++ src/hg/utils/automation/doRepeatModeler.pl
@@ -0,0 +1,307 @@
+#!/usr/bin/env perl
+
+# DO NOT EDIT the /cluster/bin/scripts copy of this file --
+# edit ~/kent/src/hg/utils/automation/doRepeatModeler.pl instead.
+
+use Getopt::Long;
+use warnings;
+use strict;
+use Carp;
+use FindBin qw($Bin);
+use lib "$Bin";
+use HgAutomate;
+use HgRemoteScript;
+use HgStepManager;
+
+# Hardcoded command path:
+my $RepeatModelerPath = "/hive/data/outside/RepeatModeler-2.0.4";
+my $RepeatModeler = "$RepeatModelerPath/RepeatModeler";
+my $BuildDatabase = "$RepeatModelerPath/BuildDatabase";
+# configured to consume one entire ku machine node
+my $threadCount = "-threads 32";
+my $parasolOpts = "-cpu=32 -ram=128g";
+# Option defaults
+my $bigClusterHub = 'ku';
+my $workhorse = "hgwdev";
+my $defaultWorkhorse = 'hgwdev';
+
+# Option variable names, both common and peculiar to this script:
+use vars @HgAutomate::commonOptionVars;
+use vars @HgStepManager::optionVars;
+use vars qw/
+    $opt_buildDir
+    $opt_unmaskedSeq
+    /;
+
+# Specify the steps supported with -continue / -stop:
+my $stepper = new HgStepManager(
+    [ { name => 'blastDb', func => \&doBlastDb },
+      { name => 'cluster',     func => \&doCluster },
+      { name => 'cleanup', func => \&doCleanup },
+    ]
+				);
+
+# Option defaults:
+my $dbHost = 'hgwdev';
+my $unmaskedSeq = "\$db.unmasked.2bit";
+
+my $base = $0;
+$base =~ s/^(.*\/)?//;
+
+sub usage {
+  # Usage / help / self-documentation:
+  my ($status, $detailed) = @_;
+  # Basic help (for incorrect usage):
+  print STDERR "
+usage: $base db
+options:
+    the db argument is a UCSC database name or the assembly identifier
+    for a GenArk assembly hub build
+";
+  print STDERR $stepper->getOptionHelp();
+  print STDERR <<_EOF_
+    -buildDir dir         Use dir instead of default
+                          $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatModeler.\$date
+                          (necessary when continuing at a later date).
+    -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead
+                          of default ($unmaskedSeq).
+_EOF_
+  ;
+  print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
+						'workhorse' => '',
+						'bigClusterHub' => '');
+  print STDERR "
+Automates the RepeatModeler process for genome assembly \$db.  Steps:
+    blastDb: construct fasta file from unmasked.2bit and rmblastn index files.
+    cluster: Parasol cluster run of RepeatModeler.
+    libResult: Collect the consensus library file from the RepeatModeler output.
+    cleanup: Removes or compresses intermediate files.
+All operations are performed in the build directory which is
+$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatModeler.\$date unless -buildDir is given.
+Run -help to see what files are required for this script.
+";
+  # Detailed help (-help):
+  print STDERR "
+Assumptions:
+1. $HgAutomate::clusterData/\$db/\$db.unmasked.2bit contains sequence for
+   database/assembly \$db.  (This can be overridden with -unmaskedSeq.)
+2. When complete, the resulting RepeatMasker library file will be in the build
+   directory with the name: asmId-families.fa
+" if ($detailed);
+  print STDERR "\n";
+  exit $status;
+}
+
+# Globals:
+# Command line args: db
+my ($db);
+# Other:
+my ($buildDir, $chromBased, $updateTable, $secondsStart, $secondsEnd);
+
+sub checkOptions {
+  # Make sure command line options are valid/supported.
+  my $ok = GetOptions(@HgStepManager::optionSpec,
+		      'buildDir=s',
+		      'unmaskedSeq=s',
+		      @HgAutomate::commonOptionSpec,
+		      );
+  &usage(1) if (!$ok);
+  &usage(0, 1) if ($opt_help);
+  &HgAutomate::processCommonOptions();
+  my $err = $stepper->processOptions();
+  usage(1) if ($err);
+  $dbHost = $opt_dbHost if ($opt_dbHost);
+  $workhorse = $opt_workhorse if ($opt_workhorse);
+  $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
+}
+
+#########################################################################
+# * step: cluster [workhorse]
+sub doBlastDb {
+  my $runDir = "$buildDir";
+  # verify starting with a clean directory, not done before
+  if ( ! $opt_debug ) {
+    if ( -d "$runDir" ) {
+       if ( -s "$runDir/$db.nsq" ) {
+         &HgAutomate::verbose(1, "\nblastDb step previously completed\n");
+         return;
+       }
+    }
+  }
+  &HgAutomate::mustMkdir($runDir);
+
+  if (! -e $unmaskedSeq) {
+    die "Error: required file $unmaskedSeq does not exist.";
+  }
+
+  my $whatItDoes =
+"Construct .fa file from unmasked.2bit, then run BuildDatabase from
+RepeatModeler to prepare rmblastn index files.";
+
+  my $bossScript = newBash HgRemoteScript("$runDir/blastDb.bash", $workhorse,
+				      $runDir, $whatItDoes);
+
+  $bossScript->add(<<_EOF_
+export asmId="${db}"
+export unmasked2Bit="${unmaskedSeq}"
+export bDatabase="${BuildDatabase}"
+
+if [ "\${unmasked2Bit}" -nt "\${asmId}.fa" ]; then
+  twoBitToFa "\${unmasked2Bit}" "\${asmId}.fa"
+  touch -r "\${unmasked2Bit}" "\${asmId}.fa"
+fi
+
+if [ "\${asmId}.fa" -nt "\${asmId}.nsq" ]; then
+  time (\$bDatabase -name "\${asmId}" -engine ncbi "\${asmId}.fa") > blastDb.log 2>&1
+fi
+_EOF_
+    );
+
+  $bossScript->execute() if (! $opt_debug);
+} # sub doBlastDb
+
+#########################################################################
+# * step: cluster [bigClusterHub]
+sub doCluster {
+  my $runDir = "$buildDir";
+  my $paraHub = $bigClusterHub;
+
+  # First, make sure previous step has completed:
+  if ( ! $opt_debug ) {
+    if ( ! -s "$runDir/$db.nsq" ) {
+      die "doCluster: previous 'blastDb' step has not completed, $db.nsq not present\n";
+    }
+    # And, verify this step has not run before
+    if ( -s "$runDir/run.time" && ! -s "$runDir/${db}-families.fa" ) {
+      die "cluster: this step appears to have run before, but is broken, run.time is present but ${db}-families.fa is not present ?";
+    }
+    if ( -s "$runDir/${db}-families.fa" ) {
+         &HgAutomate::verbose(1, "\ncluster step previously completed\n");
+         return;
+    }
+  }
+
+  my $whatItDoes =
+"runs single cluster job to perform the RepeatModeler process.";
+
+  my $bossScript = newBash HgRemoteScript("$runDir/doCluster.bash", $paraHub,
+				      $runDir, $whatItDoes);
+  $bossScript->add(<<_EOF_
+printf '#!/bin/bash
+
+set -beEu -o pipefail
+
+export asmId="\${1}"
+export threadCount="${threadCount}"
+export rModeler="${RepeatModeler}"
+
+time (\$rModeler -engine ncbi \$threadCount -database "\${asmId}") > modeler.log 2>&1
+' > oneJob
+chmod +x oneJob
+printf "oneJob ${db} {check out line+ ${db}-rmod.log}\n" > jobList
+para make $parasolOpts jobList
+para check
+para time > run.time
+cat run.time
+_EOF_
+  );
+  $bossScript->execute() if (! $opt_debug);
+} # doCluster
+
+#########################################################################
+# * step: cleanup [workhorse]
+sub doCleanup {
+  my $runDir = "$buildDir";
+
+  # First, make sure previous step has completed:
+  if ( ! $opt_debug ) {
+    if ( -s "$runDir/run.time" && ! -s "$runDir/${db}-families.fa" ) {
+      die "cleanup: previous 'cluster' step appears to be broken, run.time is present but ${db}-families.fa is not present ?";
+    }
+    if ( ! -s "$runDir/${db}-families.fa" ) {
+      die "cleanup previous 'libResult' step has not completed, ${db}-families.fa not present\n";
+    }
+    # And, verify this step has not run before
+    if ( ! -s "$runDir/${db}.fa" ) {
+         &HgAutomate::verbose(1, "\ncleanup step previously completed\n");
+         return;
+    }
+  }
+  my $whatItDoes = "Cleans up or compresses intermediate files.";
+  my $bossScript = newBash HgRemoteScript("$runDir/doCleanup.bash", $workhorse,
+				      $runDir, $whatItDoes);
+  $bossScript->add(<<_EOF_
+export asmId="${db}"
+
+rm -f \${asmId}.fa
+rm -f \${asmId}.n??
+gzip -c \${asmId}.stk
+c=`ls -d RM_* | wc -l`
+if [ "\${c}" -eq 1 ]; then
+   RM_dir=`ls -d RM_*`
+   if [ -d "\${RM_dir}" ]; then
+     rm -fr "\${RM_dir}"
+   else
+     printf "directory RM_* not found ?\n" 1>&2
+     ls -d RM* 1>&2
+     exit 255
+   fi
+else
+   printf "single directory RM_* not found ?\n" 1>&2
+   ls -d RM* 1>&2
+   exit 255
+fi
+_EOF_
+  );
+  $bossScript->execute() if (! $opt_debug);
+} # doCleanup
+
+#########################################################################
+# main
+
+# Prevent "Suspended (tty input)" hanging:
+&HgAutomate::closeStdin();
+
+# Make sure we have valid options and exactly 1 argument:
+&checkOptions();
+&usage(1) if (scalar(@ARGV) != 1);
+$secondsStart = `date "+%s"`;
+chomp $secondsStart;
+($db) = @ARGV;
+
+# Now that we know the $db, figure out our paths:
+my $date = `date +%Y-%m-%d`;
+chomp $date;
+$buildDir = $opt_buildDir ? $opt_buildDir :
+  "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/RepeatModeler.$date";
+$unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq :
+  "$HgAutomate::clusterData/$db/$db.unmasked.2bit";
+
+# Do everything.
+$stepper->execute();
+
+# Tell the user anything they should know.
+my $stopStep = $stepper->getStopStep();
+my $upThrough = ($stopStep eq 'cleanup') ? "" :
+  "  (through the '$stopStep' step)";
+
+$secondsEnd = `date "+%s"`;
+chomp $secondsEnd;
+my $elapsedSeconds = $secondsEnd - $secondsStart;
+my $elapsedMinutes = int($elapsedSeconds/60);
+$elapsedSeconds -= $elapsedMinutes * 60;
+
+&HgAutomate::verbose(1, <<_EOF_
+
+ *** All done!$upThrough - Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s
+ *** Steps were performed in $buildDir
+_EOF_
+);
+if ($stepper->stepPrecedes('cluster', $stopStep)) {
+  &HgAutomate::verbose(1, <<_EOF_
+ *** Result library file should be present in\n$buildDir/${db}-families.fa
+     to be used by doRepeatMasker.pl -customLib=${db}-families.fa
+_EOF_
+  );
+}
+&HgAutomate::verbose(1, "\n");