967b763f92c9dc4d3e57399de0c875029d828ef1
hiram
  Tue Dec 13 20:58:42 2022 -0800
now running the next version of RepeatMasker v4.1.4 refs #29545

diff --git src/hg/utils/automation/doRepeatMasker.pl src/hg/utils/automation/doRepeatMasker.pl
index 017ffa4..77d21ea 100755
--- src/hg/utils/automation/doRepeatMasker.pl
+++ src/hg/utils/automation/doRepeatMasker.pl
@@ -4,49 +4,52 @@
 # edit ~/kent/src/hg/utils/automation/doRepeatMasker.pl instead.
 
 # $Id: doRepeatMasker.pl,v 1.14 2009/03/19 16:15:29 hiram Exp $
 
 use Getopt::Long;
 use warnings;
 use strict;
 use Carp;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Hardcoded command path:
-my $RepeatMaskerPath = "/hive/data/staging/data/RepeatMasker";
+my $RepeatMaskerPath = "/hive/data/staging/data/RepeatMasker221107";
 my $RepeatMasker = "$RepeatMaskerPath/RepeatMasker";
-my $RepeatMaskerEngine = "-engine crossmatch -s";
-# Let parasol pick defaults
-my $parasolRAM = "";
+# default engine changed from crossmatch to rmblast as of 2022-12
+# with RM version 4.1.4
+my $RepeatMaskerEngine = "-engine rmblast -pa 1";
+# per RM doc, rmblast uses 4 CPUs for each job
+my $parasolRAM = "-cpu=4 -ram=32g";
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_ncbiRmsk
     $opt_dupList
     $opt_liftSpec
     $opt_species
     $opt_unmaskedSeq
     $opt_customLib
     $opt_useHMMER
     $opt_useRMBlastn
+    $opt_useCrossMatch
     $opt_splitTables
     $opt_noSplit
     $opt_updateTable
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'cluster', func => \&doCluster },
       { name => 'cat',     func => \&doCat },
       { name => 'mask',    func => \&doMask },
       { name => 'install', func => \&doInstall },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
@@ -77,31 +80,32 @@
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatMasker.\$date
                           (necessary when continuing at a later date).
     -species sp           Use sp (which can be quoted genus and species, or
                           a common name that RepeatMasker recognizes.
                           Default: $defaultSpecies.
     -ncbiRmsk path/file_rm.out.gz - Use the repeat masker result supplied in
                           the assembly as calculated by NCBI
     -dupList .../download/asmId.remove.dups.list - to remove duplicates from
 			  NCBI repeat masker file
     -liftSpec path/file.lift - Use this lift file to lift the NCBI coordinates
                           to UCSC coordinates, usually used with ncbiRmsk
     -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead
                           of default ($unmaskedSeq).
     -customLib lib.fa     Use custom repeat library instead of RepeatMaskers\'s.
-    -useRMBlastn          Use NCBI rmblastn instead of crossmatch
+    -useRMBlastn          Use NCBI rmblastn (this is the default as of 2022-12)
+    -useCrossMatch        Use crossmatch instead of NCBI rmblastn
     -useHMMER             Use hmmer instead of crossmatch ( currently for human only )
     -updateTable          load into table name rmskUpdate (default: rmsk)
     -splitTables          split the _rmsk tables (default is not split)
     -noSplit              default behavior, this option no longer required.
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => '',
 						'bigClusterHub' => '');
   print STDERR "
 Automates UCSC's RepeatMasker process for genome database \$db.  Steps:
     cluster: Do a cluster run of RepeatMasker on 500kb sequence chunks.
     cat:     Concatenate the cluster run results into \$db.sorted.fa.out.
     mask:    Create a \$db.2bit masked by \$db.sorted.fa.out.
     install: Load \$db.sorted.fa.out into the rmsk table (possibly split) in \$db,
@@ -128,30 +132,31 @@
 my ($db);
 # Other:
 my ($buildDir, $chromBased, $updateTable, $secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'species=s',
 		      'ncbiRmsk=s',
 		      'dupList=s',
 		      'liftSpec=s',
 		      'unmaskedSeq=s',
 		      'customLib=s',
                       'useRMBlastn',
+                      'useCrossMatch',
                       'useHMMER',
 		      'splitTables',
 		      'noSplit',
 		      'updateTable',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
@@ -193,35 +198,35 @@
   my $customLib = $opt_customLib;
   my $repeatLib = "";
   if ($opt_customLib && $opt_species) {
      $repeatLib = "-species \'$species\' -lib $customLib";
   }
   elsif ($opt_customLib) {
      $repeatLib = "-lib $customLib";
   }
   else {
      $repeatLib = "-species \'$species\'";
   }
 
   # updated for ku kluster operation -cpu option instead of ram option
   if ( $opt_useRMBlastn ) {
     $RepeatMaskerEngine = "-engine rmblast -pa 1";
+    $parasolRAM = "-cpu=4 -ram=32g";
+  } elsif ( $opt_useCrossMatch ) {
+    $RepeatMaskerEngine = "-engine crossmatch -s";
     $parasolRAM = "-cpu=1";
-  }
-
-  # updated for ku kluster operation -cpu option instead of ram option
-  if ( $opt_useHMMER ) {
+  } elsif ( $opt_useHMMER ) {
     # NOTE: This is only applicable for 8gb one-job-per-node scheduling
     $RepeatMaskerEngine = "-engine hmmer -pa 4";
     $parasolRAM = "-cpu=4 -ram=32g";
   }
 
   # Script to do a dummy run of RepeatMasker, to test our invocation and
   # unpack library files before kicking off a large cluster run.
   #  And now that RM is being run from local /scratch/data/RepeatMasker/
   #  this is also done in the cluster run script so each node will have
   #	its library initialized
   my $fh = &HgAutomate::mustOpen(">$runDir/dummyRun.csh");
   print $fh <<_EOF_
 #!/bin/csh -ef
 
 set path = (/cluster/software/bin \$path)
@@ -306,62 +311,71 @@
 rm -rf \$tmpDir
 _EOF_
   ;
   close($fh);
 
   &HgAutomate::makeGsub($runDir,
       "./RMRun.csh {check out line $partDir/\$(path1).out}");
 
   my $whatItDoes =
 "It computes a logical partition of unmasked 2bit into 500k chunks
 and runs it on the cluster with the most available bandwidth.";
   my $bossScript = new HgRemoteScript("$runDir/doCluster.csh", $paraHub,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
+
+set path = (/cluster/software/bin \$path)
+
 chmod a+x dummyRun.csh
 chmod a+x RMRun.csh
 
 # Record RM version used:
 printf "The repeat files provided for this assembly were generated using RepeatMasker.\\
   Smit, AFA, Hubley, R & Green, P.,\\
-  RepeatMasker Open-4.0.\\
+  RepeatMasker version 4.1.4\\
   1996-2010 <http://www.repeatmasker.org>.\\
 \\
 VERSION:\\n" > ../versionInfo.txt
 
 ./dummyRun.csh | grep -v "dev/null" >> ../versionInfo.txt
 
 $RepeatMasker -v >> ../versionInfo.txt
-grep RELEASE $RepeatMaskerPath/Libraries/RepeatMaskerLib.embl >> ../versionInfo.txt
+printf "# RMRBMeta.embl library version: %s\\n" "`grep RELEASE $RepeatMaskerPath/Libraries/RMRBMeta.embl`" >> ../versionInfo.txt
 printf "# RepeatMasker engine: %s\\n" "${RepeatMaskerEngine}" >> ../versionInfo.txt
 
 ls -ld $RepeatMaskerPath $RepeatMasker
 $RepeatMasker -v
-grep RELEASE $RepeatMaskerPath/Libraries/RepeatMaskerLib.embl
+echo -n "# RMRBMeta.embl library version: "
+grep RELEASE $RepeatMaskerPath/Libraries/RMRBMeta.embl | sed -e 's/  *\\*\$//;'
 echo "# RepeatMasker engine: $RepeatMaskerEngine"
 _EOF_
   );
-  if ($opt_useRMBlastn) {
+  if ($RepeatMaskerEngine =~ m/crossmatch/) { # report the engine actually selected, not the raw flags
+    $bossScript->add(<<_EOF_
+printf "# using engine crossmatch\\n" >> ../versionInfo.txt
+echo "# useCrossMatch crossmatch"
+_EOF_
+    );
+  } elsif ($RepeatMaskerEngine =~ m/rmblast/) { # also true for the default (no engine option given)
     $bossScript->add(<<_EOF_
-printf "# using rmblastn:\\t" >> ../versionInfo.txt
+printf "# using engine rmblastn:\\t" >> ../versionInfo.txt
 echo "# useRMBlastn: rmblastn:"
 grep -w value $RepeatMaskerPath/RepeatMaskerConfig.pm | grep rmblastn | awk '{print \$NF}' >> ../versionInfo.txt
 _EOF_
     );
-  }
-  if ($opt_useHMMER) {
+  } elsif ($RepeatMaskerEngine =~ m/hmmer/) {
     $bossScript->add(<<_EOF_
 printf "# using Dfam library and HMMER3:\\n" >> ../versionInfo.txt
 echo "# useHMMER: Dfam library: "
 ls -ld $RepeatMaskerPath/Libraries/Dfam.hmm
 grep Release: $RepeatMaskerPath/Libraries/Dfam.hmm >> ../versionInfo.txt
 echo "# useHMMER: HMMER3: "
 grep -m 1 ^HMMER3 $RepeatMaskerPath/Libraries/Dfam.hmm >> ../versionInfo.txt
 _EOF_
     );
   }
   if (length($repeatLib) > 0) {
     $bossScript->add(<<_EOF_
 printf "# RepeatMasker library options: %s\\n" "${repeatLib}" >> ../versionInfo.txt
 echo "# RepeatMasker library options: '$repeatLib'"
 _EOF_