src/hg/utils/automation/doRepeatMasker.pl 6a48e07c2fbefbf0a0e5e74662dd768ef93fca88

6a48e07c2fbefbf0a0e5e74662dd768ef93fca88
hiram
  Wed Apr 29 11:27:30 2020 -0700
adding ncbiRmsk option to use NCBI rm.out.gz file instead of cluster run and record versionInfo.txt of RepeatMasker operation refs #23734

diff --git src/hg/utils/automation/doRepeatMasker.pl src/hg/utils/automation/doRepeatMasker.pl
index 286e999..8e05e16 100755
--- src/hg/utils/automation/doRepeatMasker.pl
+++ src/hg/utils/automation/doRepeatMasker.pl
@@ -15,74 +15,85 @@
 use HgRemoteScript;
 use HgStepManager;
 
 # Hardcoded command path:
 my $RepeatMaskerPath = "/hive/data/staging/data/RepeatMasker";
 my $RepeatMasker = "$RepeatMaskerPath/RepeatMasker";
 my $RepeatMaskerEngine = "-engine crossmatch -s";
 # Let parasol pick defaults
 my $parasolRAM = "";
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
+    $opt_ncbiRmsk
+    $opt_liftSpec
     $opt_species
     $opt_unmaskedSeq
     $opt_customLib
     $opt_useHMMER
     $opt_useRMBlastn
     $opt_splitTables
     $opt_noSplit
     $opt_updateTable
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'cluster', func => \&doCluster },
       { name => 'cat',     func => \&doCat },
       { name => 'mask',    func => \&doMask },
       { name => 'install', func => \&doInstall },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $unmaskedSeq = "\$db.unmasked.2bit";
 my $defaultSpecies = 'scientificName from dbDb';
+my $ncbiRmsk = "";	# path to NCBI *_rm.out.gz file to use NCBI result
+                        # for repeat masking result
+                        # in the assembly as calculated by NCBI
+my $liftSpec = "";	# lift file from NCBI coordinates to UCSC coordinates
+                        # usually required with ncbiRmsk
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base db
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatMasker.\$date
                           (necessary when continuing at a later date).
     -species sp           Use sp (which can be quoted genus and species, or
                           a common name that RepeatMasker recognizes.
                           Default: $defaultSpecies.
+    -ncbiRmsk path/file_rm.out.gz - Use the repeat masker result supplied in
+                          the assembly as calculated by NCBI
+    -liftSpec path/file.lift - Use this lift file to lift the NCBI coordinates
+                          to UCSC coordinates, usually used with ncbiRmsk
     -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead
                           of default ($unmaskedSeq).
     -customLib lib.fa     Use custom repeat library instead of RepeatMaskers\'s.
     -useRMBlastn          Use NCBI rmblastn instead of crossmatch
     -useHMMER             Use hmmer instead of crossmatch ( currently for human only )
     -updateTable          load into table name rmskUpdate (default: rmsk)
     -splitTables          split the _rmsk tables (default is not split)
     -noSplit              default behavior, this option no longer required.
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => '',
 						'bigClusterHub' => '');
   print STDERR "
 Automates UCSC's RepeatMasker process for genome database \$db.  Steps:
@@ -107,51 +118,63 @@
   exit $status;
 }
 
 
 # Globals:
 # Command line args: db
 my ($db);
 # Other:
 my ($buildDir, $chromBased, $updateTable, $secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'species=s',
+		      'ncbiRmsk=s',
+		      'liftSpec=s',
 		      'unmaskedSeq=s',
 		      'customLib=s',
                       'useRMBlastn',
                       'useHMMER',
 		      'splitTables',
 		      'noSplit',
 		      'updateTable',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
 # * step: cluster [bigClusterHub]
 sub doCluster {
+  return if ($ncbiRmsk);
   my $runDir = "$buildDir/run.cluster";
+  if ( -d "$runDir" ) {
+       if ( -s "$runDir/run.time" ) {
+         &HgAutomate::verbose(1,
+	  "\ncluster step previously completed\n");
+         return;
+       } else {
+         die "\nERROR: cluster step may not be completed properly\n";
+       }
+  }
   &HgAutomate::mustMkdir($runDir);
 
   my $paraHub = $opt_bigClusterHub ? $opt_bigClusterHub :
     &HgAutomate::chooseClusterByBandwidth();
   if (! -e $unmaskedSeq) {
     die "Error: required file $unmaskedSeq does not exist.";
   }
   my @okIn = grep !/scratch/,
     &HgAutomate::chooseFilesystemsForCluster($paraHub, "in");
   my @okOut = &HgAutomate::chooseFilesystemsForCluster($paraHub, "out");
   if (scalar(@okOut) > 1) {
     @okOut = grep !/$okIn[0]/, @okOut;
   }
   my $inHive = 0;
   $inHive = 1 if ($okIn[0] =~ m#/hive/data/genomes#);
@@ -277,58 +300,73 @@
   ;
   close($fh);
 
   &HgAutomate::makeGsub($runDir,
       "./RMRun.csh {check out line $partDir/\$(path1).out}");
 
   my $whatItDoes =
 "It computes a logical partition of unmasked 2bit into 500k chunks
 and runs it on the cluster with the most available bandwidth.";
   my $bossScript = new HgRemoteScript("$runDir/doCluster.csh", $paraHub,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 chmod a+x dummyRun.csh
 chmod a+x RMRun.csh
-./dummyRun.csh
 
 # Record RM version used:
+printf "The repeat files provided for this assembly were generated using RepeatMasker.\\
+  Smit, AFA, Hubley, R & Green, P.,\\
+  RepeatMasker Open-4.0.\\
+  1996-2010 <http://www.repeatmasker.org>.\\
+\\
+VERSION:\\n" > ../versionInfo.txt
+
+./dummyRun.csh | grep -v "dev/null" >> ../versionInfo.txt
+
+grep 'version of RepeatMasker\$' $RepeatMasker >> ../versionInfo.txt
+grep RELEASE $RepeatMaskerPath/Libraries/RepeatMaskerLib.embl >> ../versionInfo.txt
+printf "# RepeatMasker engine: %s\\n" "${RepeatMaskerEngine}" >> ../versionInfo.txt
+
 ls -ld $RepeatMaskerPath $RepeatMasker
 grep 'version of RepeatMasker\$' $RepeatMasker
 grep RELEASE $RepeatMaskerPath/Libraries/RepeatMaskerLib.embl
 echo "# RepeatMasker engine: $RepeatMaskerEngine"
 _EOF_
   );
   if ($opt_useRMBlastn) {
     $bossScript->add(<<_EOF_
+printf "# using rmblastn:\\n" >> ../versionInfo.txt
 echo "# useRMBlastn: rmblastn:"
-grep RMBLAST_DIR $RepeatMaskerPath/RepeatMaskerConfig.pm | awk '{print \$NF}'
+grep RMBLAST_DIR $RepeatMaskerPath/RepeatMaskerConfig.pm | grep rmblastn-2 | awk '{print \$NF}' >> ../versionInfo.txt
 _EOF_
     );
   }
   if ($opt_useHMMER) {
     $bossScript->add(<<_EOF_
+printf "# using Dfam library and HMMER3:\\n" >> ../versionInfo.txt
 echo "# useHMMER: Dfam library: "
 ls -ld $RepeatMaskerPath/Libraries/Dfam.hmm
-grep Release: $RepeatMaskerPath/Libraries/Dfam.hmm
+grep Release: $RepeatMaskerPath/Libraries/Dfam.hmm >> ../versionInfo.txt
 echo "# useHMMER: HMMER3: "
-grep -m 1 ^HMMER3 $RepeatMaskerPath/Libraries/Dfam.hmm
+grep -m 1 ^HMMER3 $RepeatMaskerPath/Libraries/Dfam.hmm >> ../versionInfo.txt
 _EOF_
     );
   }
   if (length($repeatLib) > 0) {
     $bossScript->add(<<_EOF_
+printf "# RepeatMasker library options: %s\\n" "${repeatLib}" >> ../versionInfo.txt
 echo "# RepeatMasker library options: '$repeatLib'"
 _EOF_
     );
   }
   if ( ! $opt_unmaskedSeq && ! $inHive) {
     $bossScript->add(<<_EOF_
 mkdir -p $clusterSeqDir
 rsync -av $unmaskedSeq $clusterSeq
 _EOF_
     );
   }
   if ($opt_unmaskedSeq) {
     $bossScript->add(<<_EOF_
 rm -rf $partDir
 $Bin/simplePartition.pl $clusterSeq 500000 $partDir
@@ -339,107 +377,147 @@
 rm -rf $partDir
 $Bin/simplePartition.pl $clusterSeq 500000 $partDir
 rm -f $buildDir/RMPart
 ln -s $partDir $buildDir/RMPart
 _EOF_
     );
   }
   my $binPara = "/parasol/bin/para";
   if ( ! -s "$binPara" ) {
     # allow PATH to find the para command
     $binPara = "para";
   }
   my $gensub2 = &HgAutomate::gensub2();
   $bossScript->add(<<_EOF_
 
+printf "\\nPARAMETERS:\\
+$RepeatMasker $RepeatMaskerEngine -align $repeatLib\\n" >> ../versionInfo.txt
+
 $gensub2 $partDir/partitions.lst single gsub jobList
+exit 0
 $binPara $parasolRAM make jobList
 $binPara check
 $binPara time > run.time
 cat run.time
 
 _EOF_
   );
   if (! $opt_unmaskedSeq && ! $inHive) {
     $bossScript->add(<<_EOF_
 rm -f $clusterSeq
 _EOF_
     );
   }
   $bossScript->execute();
 } # doCluster
 
 #########################################################################
 # * step: cat [fileServer]
 sub doCat {
   my $runDir = "$buildDir";
+  if (! -s "${ncbiRmsk}" ) {
       &HgAutomate::checkExistsUnlessDebug('cluster', 'cat',
 				      "$buildDir/run.cluster/run.time");
-
+  }
+  if ( -s "$runDir/doCat.bash" ) {
+     if ( ! (-s "$runDir/$db.sorted.fa.out.gz" || -s "$runDir/$db.sorted.fa.out" ) ) {
+         die "\nERROR: cat step may not be completed properly\n";
+     } else {
+         &HgAutomate::verbose(1,
+	  "\ncat step previously completed\n");
+         return;
+     }
+  }
   my $whatItDoes =
 "It concatenates .out files from cluster run into a single $db.sorted.fa.out.\n" .
 "liftUp (with no lift specs) is used to concatenate .out files because it\n" .
 "uniquifies (per input file) the .out IDs which can then be used to join\n" .
 "fragmented repeats in the Nested Repeats track.";
   my $fileServer = &HgAutomate::chooseFileServer($runDir);
   my $bossScript = newBash HgRemoteScript("$runDir/doCat.bash", $fileServer,
 				      $runDir, $whatItDoes);
 
-  # Use symbolic link created in cluster step:
-  my $partDir = "$buildDir/RMPart";
-  my $indent = "";
-  my $path = "";
-  my $levels = $opt_debug ? 1 : `cat $partDir/levels`;
-  chomp $levels;
   $bossScript->add("# Debug mode -- don't know the real number of levels\n")
     if ($opt_debug);
 
+  if ($ncbiRmsk) {
+    my $zippedSource = 0;	# asssume not a gz zipped file
+    $zippedSource = 1 if ($ncbiRmsk =~ m/.gz$/);
+    my $liftOpts = "";
+    if (-s "${liftSpec}" ) {
+      $liftOpts = "| liftUp -type=.out stdout $liftSpec carry stdin";
+    }
+    $bossScript->add(<<_EOF_
+export db="${db}"
+export ncbiRmOutFile="${ncbiRmsk}"
+cat /dev/null > "\${db}.sorted.fa.out"
+_EOF_
+    );
+    if ($zippedSource) {
+    $bossScript->add(<<_EOF_
+zcat "\${ncbiRmOutFile}" | tail -n +4 | sort -k5,5 -k6,6n $liftOpts >> "\${db}.sorted.fa.out"
+_EOF_
+    );
+    } else {
+    $bossScript->add(<<_EOF_
+tail -n +4 "\${ncbiRmOutFile}" sort -k5,5 -k6,6n $liftOpts >> "\${db}.sorted.fa.out"
+_EOF_
+    );
+    }
+  } else {	# not using the NCBI supplied file, use local cluster run result
+  my $partDir = "$buildDir/RMPart";
+  my $levels = $opt_debug ? 1 : `cat $partDir/levels`;
+  chomp $levels;
+  # Use symbolic link created in cluster step:
+  my $indent = "";
+  my $path = "";
   # Make the appropriate number of nested foreach's to match the number
   # of levels from the partitioning.  If $levels is 1, no foreach required.
   for (my $l = $levels - 2;  $l >= 0;  $l--) {
     my $dir = ($l == ($levels - 2)) ? $partDir : "\$d" . ($l + 1);
     $bossScript->add(<<_EOF_
 ${indent}for d$l in $dir/???
 do
 _EOF_
     );
     $indent .= '  ';
     $path .= "\$d$l/";
   }
   for (my $l = 0;  $l <= $levels - 2;  $l++) {
     $bossScript->add(<<_EOF_
 ${indent}  liftUp ${path}cat.out /dev/null carry ${path}???/*.out
 ${indent}  liftUp ${path}cat.align /dev/null carry ${path}???/*.align
 ${indent}done
 _EOF_
     );
     $indent =~ s/  //;
     $path =~ s/\$d\d+\/$//;
   }
   $bossScript->add(<<_EOF_
 
 liftUp $db.fa.out /dev/null carry $partDir/???/*.out
 liftUp $db.fa.align /dev/null carry $partDir/???/*.align
 head -3 $db.fa.out > $db.sorted.fa.out
 tail -n +4 $db.fa.out | sort -k5,5 -k6,6n >> $db.sorted.fa.out
 _EOF_
   );
+  }
   $bossScript->add(<<_EOF_
 
 # Use the ID column to join up fragments of interrupted repeats for the
 # Nested Repeats track.  Try to avoid the Undefined id errors.
-($Bin/extractNestedRepeats.pl $db.fa.out 2> nestRep.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed
+($Bin/extractNestedRepeats.pl $db.sorted.fa.out 2> nestRep.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed
 if [ -s "nestRep.err" ]; then
   export lineCount=`(grep "Undefined id, line" nestRep.err || true) | cut -d' ' -f6 | wc -l`
    if [ "\${lineCount}" -gt 0 ]; then
      if [ "\${lineCount}" -gt 10 ]; then
         printf "ERROR: too many Undefined id lines (> 10) reported by extractNestedRepeats.pl" 1>&2
         exit 255
      fi
      export sedExpr=`grep "Undefined id, line" nestRep.err | cut -d' ' -f6 | sed -e 's/\$/d;/;' | xargs echo`
      export sedCmd="sed -i.broken -e '\$sedExpr'"
      eval \$sedCmd $db.fa.out
      ($Bin/extractNestedRepeats.pl $db.fa.out 2> nestRep2.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed
      if [ -s "nestRep2.err" ]; then
         printf "ERROR: the attempt of cleaning nestedRepeats did not work" 1>&2
         exit 255
      fi
@@ -584,70 +662,72 @@
 | splitFileByColumn -col=5 stdin /cluster/data/$db -chromDirs \\
     -ending=.fa.out -head=/tmp/rmskHead.txt
 _EOF_
     );
     $bossScript->execute();
   }
 } # doInstall
 
 
 #########################################################################
 # * step: cleanup [fileServer]
 sub doCleanup {
   my $runDir = "$buildDir";
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $fileServer = &HgAutomate::chooseFileServer($runDir);
-  my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer,
+  my $bossScript = newBash HgRemoteScript("$runDir/doCleanup.bash", $fileServer,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
-if (-e $db.fa.align) then
+if [ -e "$db.fa.align" ]; then
   gzip $db.fa.align
-endif
+fi
 rm -fr RMPart/*
 rm -fr RMPart
-if ( -d /hive/data/genomes/$db/RMPart ) then
+if [ -d "/hive/data/genomes/$db/RMPart" ]; then
    rmdir /hive/data/genomes/$db/RMPart
-endif
+fi
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 ($db) = @ARGV;
 
 # Now that we know the $db, figure out our paths:
 my $date = `date +%Y-%m-%d`;
 chomp $date;
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/RepeatMasker.$date";
 $unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq :
   "$HgAutomate::clusterData/$db/$db.unmasked.2bit";
 my $seqCount = `twoBitInfo $unmaskedSeq stdout | wc -l`;
 $chromBased = ($seqCount <= $HgAutomate::splitThreshold) && $opt_splitTables;
 $updateTable = $opt_updateTable ? "Update" : "";
+$ncbiRmsk = $opt_ncbiRmsk ? $opt_ncbiRmsk : "";
+$liftSpec = $opt_liftSpec ? $opt_liftSpec : "";
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;