2e584a1c66bae4339bd009372f47e97f0636fe85
hiram
  Tue Oct 29 15:01:41 2024 -0700
indicate which paraHub the cluster job is on refs #34685

diff --git src/hg/utils/automation/doRepeatMasker.pl src/hg/utils/automation/doRepeatMasker.pl
index c09eafb..a45cb52 100755
--- src/hg/utils/automation/doRepeatMasker.pl
+++ src/hg/utils/automation/doRepeatMasker.pl
@@ -1,824 +1,825 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/doRepeatMasker.pl instead.
 
 # $Id: doRepeatMasker.pl,v 1.14 2009/03/19 16:15:29 hiram Exp $
 
 use Getopt::Long;
 use warnings;
 use strict;
 use Carp;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Hardcoded command path:
 my $RepeatMaskerPath = "/hive/data/staging/data/RepeatMasker221107";
 my $RepeatMasker = "$RepeatMaskerPath/RepeatMasker";
 # default engine changed from crossmatch to rmblast as of 2022-12
 # with RM version 4.1.4
 my $RepeatMaskerEngine = "-engine rmblast -pa 1";
 # per RM doc, rmblast uses 4 CPUs for each job
 my $parasolRAM = "-cpu=4 -ram=32g";
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_ncbiRmsk
     $opt_dupList
     $opt_liftSpec
     $opt_species
     $opt_unmaskedSeq
     $opt_customLib
     $opt_useHMMER
     $opt_useRMBlastn
     $opt_useCrossMatch
     $opt_splitTables
     $opt_noSplit
     $opt_updateTable
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'cluster', func => \&doCluster },
       { name => 'cat',     func => \&doCat },
       { name => 'mask',    func => \&doMask },
       { name => 'install', func => \&doInstall },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $unmaskedSeq = "\$db.unmasked.2bit";
 my $defaultSpecies = 'scientificName from dbDb';
 my $ncbiRmsk = "";	# path to NCBI *_rm.out.gz file to use NCBI result
                         # for repeat masking result
                         # in the assembly as calculated by NCBI
 my $dupList = "";	# path to download/asmId.remove.dups.list for NCBI
 my $liftSpec = "";	# lift file from NCBI coordinates to UCSC coordinates
                         # usually required with ncbiRmsk
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base db
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatMasker.\$date
                           (necessary when continuing at a later date).
     -species sp           Use sp (which can be quoted genus and species, or
                           a common name that RepeatMasker recognizes.
                           Default: $defaultSpecies.
     -ncbiRmsk path/file_rm.out.gz - Use the repeat masker result supplied in
                           the assembly as calculated by NCBI
     -dupList .../download/asmId.remove.dups.list - to remove duplicates from
 			  NCBI repeat masker file
     -liftSpec path/file.lift - Use this lift file to lift the NCBI coordinates
                           to UCSC coordinates, usually used with ncbiRmsk
     -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead
                           of default ($unmaskedSeq).
     -customLib lib.fa     Use custom repeat library instead of RepeatMaskers\'s.
     -useRMBlastn          This is the default as of 2022-12 == NCBI rmblastn
     -useCrossMatch        Use crossmatch instead of NCBI rmblastn
     -useHMMER             Use hmmer instead of crossmatch ( currently for human only )
     -updateTable          load into table name rmskUpdate (default: rmsk)
     -splitTables          split the _rmsk tables (default is not split)
     -noSplit              default behavior, this option no longer required.
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => '',
 						'bigClusterHub' => '');
   print STDERR "
 Automates UCSC's RepeatMasker process for genome database \$db.  Steps:
     cluster: Do a cluster run of RepeatMasker on 500kb sequence chunks.
     cat:     Concatenate the cluster run results into \$db.sorted.fa.out.
     mask:    Create a \$db.2bit masked by \$db.sorted.fa.out.
     install: Load \$db.sorted.fa.out into the rmsk table (possibly split) in \$db,
              install \$db.2bit in $HgAutomate::clusterData/\$db/, and if \$db is
              chrom-based, split \$db.sorted.fa.out into per-chrom files.
     cleanup: Removes or compresses intermediate files.
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatMasker.\$date unless -buildDir is given.
 Run -help to see what files are required for this script.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.unmasked.2bit contains sequence for
    database/assembly \$db.  (This can be overridden with -unmaskedSeq.)
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
 # Command line args: db
 my ($db);
 # Other:
 my ($buildDir, $chromBased, $updateTable, $secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'species=s',
 		      'ncbiRmsk=s',
 		      'dupList=s',
 		      'liftSpec=s',
 		      'unmaskedSeq=s',
 		      'customLib=s',
                       'useRMBlastn',
                       'useCrossMatch',
                       'useHMMER',
 		      'splitTables',
 		      'noSplit',
 		      'updateTable',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
 # * step: cluster [bigClusterHub]
 sub doCluster {
   return if ($ncbiRmsk);
   my $runDir = "$buildDir/run.cluster";
   if ( -d "$runDir" ) {
        if ( -s "$runDir/run.time" ) {
          &HgAutomate::verbose(1,
 	  "\ncluster step previously completed\n");
          return;
        } else {
          die "\nERROR: cluster step may not be completed properly\n";
        }
   }
   &HgAutomate::mustMkdir($runDir);
 
   my $paraHub = $opt_bigClusterHub ? $opt_bigClusterHub :
     &HgAutomate::chooseClusterByBandwidth();
   if (! -e $unmaskedSeq) {
     die "Error: required file $unmaskedSeq does not exist.";
   }
   my @okIn = grep !/scratch/,
     &HgAutomate::chooseFilesystemsForCluster($paraHub, "in");
   my @okOut = &HgAutomate::chooseFilesystemsForCluster($paraHub, "out");
   if (scalar(@okOut) > 1) {
     @okOut = grep !/$okIn[0]/, @okOut;
   }
   my $inHive = 0;
   $inHive = 1 if ($okIn[0] =~ m#/hive/data/genomes#);
   my $clusterSeqDir = "$okIn[0]/$db";
   $clusterSeqDir = "$buildDir" if ($opt_unmaskedSeq);
   my $clusterSeq = "$clusterSeqDir/$db.unmasked.2bit";
   $clusterSeq = "$unmaskedSeq" if ($opt_unmaskedSeq);
   my $partDir .= "$okOut[0]/$db/RMPart";
   $partDir = "$buildDir/RMPart" if ($opt_unmaskedSeq);
   my $species = $opt_species ? $opt_species : &HgAutomate::getSpecies($dbHost, $db);
   my $customLib = $opt_customLib;
   my $repeatLib = "";
   if ($opt_customLib && $opt_species) {
      $repeatLib = "-species \'$species\' -lib $customLib";
   }
   elsif ($opt_customLib) {
      $repeatLib = "-lib $customLib";
   }
   else {
      $repeatLib = "-species \'$species\'";
   }
 
   # updated for ku kluster operation -cpu option instead of ram option
   if ( $opt_useRMBlastn ) {
     $RepeatMaskerEngine = "-engine rmblast -pa 1";
     $parasolRAM = "-cpu=4 -ram=32g";
   } elsif ( $opt_useCrossMatch ) {
     $RepeatMaskerEngine = "-engine crossmatch -s";
     $parasolRAM = "-cpu=1";
   } elsif ( $opt_useHMMER ) {
     # NOTE: This is only applicable for 8gb one-job-per-node scheduling
     $RepeatMaskerEngine = "-engine hmmer -pa 4";
     $parasolRAM = "-cpu=4 -ram=32g";
   }
 
   # Script to do a dummy run of RepeatMasker, to test our invocation and
   # unpack library files before kicking off a large cluster run.
   #  And now that RM is being run from local /scratch/data/RepeatMasker/
   #  this is also done in the cluster run script so each node will have
   #	its library initialized
   my $fh = &HgAutomate::mustOpen(">$runDir/dummyRun.csh");
   print $fh <<_EOF_
 #!/bin/csh -ef
 
 set path = (/cluster/software/bin \$path)
 $RepeatMasker $RepeatMaskerEngine $repeatLib /dev/null
 _EOF_
   ;
   close($fh);
 
   my $tmpDir = &HgAutomate::tmpDir();
   # Cluster job script:
   $fh = &HgAutomate::mustOpen(">$runDir/RMRun.csh");
   print $fh <<_EOF_
 #!/bin/csh -ef
 
 if ( -d "/data/tmp" ) then
   setenv TMPDIR "/data/tmp"
 else if ( -d "/scratch/tmp" ) then
   setenv TMPDIR "/scratch/tmp"
 else
   setenv TMPDIR "/tmp"
 endif
 set path = (/cluster/software/bin \$path)
 
 set finalOut = \$1
 
 set inLst = \$finalOut:r
 set inLft = \$inLst:r.lft
 set alignOut = \$finalOut:r.align
 set catOut = \$finalOut:r.cat
 
 # Use local disk for output, and move the final result to \$outPsl
 # when done, to minimize I/O.
 set tmpDir = `mktemp -d -p \$TMPDIR doRepeatMasker.cluster.XXXXXX`
 pushd \$tmpDir
 
 # Initialize local library
 $RepeatMasker $RepeatMaskerEngine $repeatLib /dev/null
 
 foreach spec (`cat \$inLst`)
   # Remove path and .2bit filename to get just the seq:start-end spec:
   set base = `echo \$spec | sed -r -e 's/^[^:]+://'`
   # RM has a limitation of the length of a sequence name
   # in the case of name too long, create a shorter name, then lift
   set nameLength = `echo \$base | wc -c`
   set shortName = `echo \$base  | sed -e 's/>//;' | md5sum | cut -d' ' -f1`
   # If \$spec is the whole sequence, twoBitToFa removes the :start-end part,
   # which causes liftUp to barf later.  So tweak the header back to
   # seq:start-end for liftUp's sake:
   if ( \$nameLength > 45 ) then
     twoBitToFa \$spec stdout | sed -e "s/^>.*/>\$shortName/" > \$base.fa
     set fastaSize = `faSize \$base.fa | grep bases | cut -d' ' -f1`
     /bin/printf "0\\t\$shortName\\t\$fastaSize\\t\$base\\t\$fastaSize" > lift.\$base.txt
   else
     twoBitToFa \$spec stdout | sed -e "s/^>.*/>\$base/" > \$base.fa
   endif
   $RepeatMasker $RepeatMaskerEngine -align $repeatLib \$base.fa
   if ( \$nameLength > 45 ) then
     liftUp -type=.out stdout lift.\$base.txt error \$base.fa.out > lift.\$base.out
     liftUp -type=.align stdout lift.\$base.txt error \$base.fa.align > lift.\$base.align
     mv -f lift.\$base.out \$base.fa.out
     mv -f lift.\$base.align \$base.fa.align
   endif
   if (-e \$base.fa.cat) then
     if ( \$nameLength > 45 ) then
       liftUp -type=.align stdout lift.\$base.txt error \$base.fa.cat > lift.\$base.cat
       mv -f lift.\$base.cat \$base.fa.cat
     endif
     mv \$base.fa.cat \$catOut
   endif
 end
 
 # Lift up (leave the RepeatMasker header in place because we'll liftUp
 # again later):
 liftUp -type=.out stdout \$inLft error *.fa.out > tmpOut__out
 
 set nonomatch
 set alignFiles = ( *.align )
 if ( \${#alignFiles} && -e \$alignFiles[1] ) then
   liftUp -type=.align stdout \$inLft error *.align \\
     > tmpOut__align
 else
   touch tmpOut__align
 endif
 
 # Move final result into place:
 mv tmpOut__out \$finalOut
 mv tmpOut__align \$alignOut
 
 popd
 rm -rf \$tmpDir
 _EOF_
   ;
   close($fh);
 
   &HgAutomate::makeGsub($runDir,
       "./RMRun.csh {check out line $partDir/\$(path1).out}");
 
   my $whatItDoes =
 "It computes a logical partition of unmasked 2bit into 500k chunks
 and runs it on the cluster with the most available bandwidth.";
   my $bossScript = new HgRemoteScript("$runDir/doCluster.csh", $paraHub,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 
 set path = (/cluster/software/bin \$path)
 
 chmod a+x dummyRun.csh
 chmod a+x RMRun.csh
 
 # Record RM version used:
 printf "The repeat files provided for this assembly were generated using RepeatMasker.\\
   Smit, AFA, Hubley, R & Green, P.,\\
   RepeatMasker version 4.1.4\\
   1996-2010 <http://www.repeatmasker.org>.\\
 \\
 VERSION:\\n" > ../versionInfo.txt
 
 ./dummyRun.csh | grep -v "dev/null" >> ../versionInfo.txt
 
 $RepeatMasker -v >> ../versionInfo.txt
 printf "# RMRBMeta.embl library version: %s\\n" "`grep RELEASE $RepeatMaskerPath/Libraries/RMRBMeta.embl`" >> ../versionInfo.txt
 printf "# RepeatMasker engine: %s\\n" "${RepeatMaskerEngine}" >> ../versionInfo.txt
 
 ls -ld $RepeatMaskerPath $RepeatMasker
 $RepeatMasker -v
 echo -n "# RMRBMeta.embl library version: "
 grep RELEASE $RepeatMaskerPath/Libraries/RMRBMeta.embl | sed -e 's/  *\\*\$//;'
 echo "# RepeatMasker engine: $RepeatMaskerEngine"
 _EOF_
   );
   if ($opt_useCrossMatch) {
     $bossScript->add(<<_EOF_
 printf "# using engine crossmatch\\n" >> ../versionInfo.txt
 echo "# useCrossMatch crossmatch"
 _EOF_
     );
   } elsif ($opt_useRMBlastn) {
     $bossScript->add(<<_EOF_
 printf "# using engine rmblastn:\\t" >> ../versionInfo.txt
 echo "# useRMBlastn: rmblastn:"
 grep -w value $RepeatMaskerPath/RepeatMaskerConfig.pm | grep rmblastn | awk '{print \$NF}' >> ../versionInfo.txt
 _EOF_
     );
   } elsif ($opt_useHMMER) {
     $bossScript->add(<<_EOF_
 printf "# using Dfam library and HMMER3:\\n" >> ../versionInfo.txt
 echo "# useHMMER: Dfam library: "
 ls -ld $RepeatMaskerPath/Libraries/Dfam.hmm
 grep Release: $RepeatMaskerPath/Libraries/Dfam.hmm >> ../versionInfo.txt
 echo "# useHMMER: HMMER3: "
 grep -m 1 ^HMMER3 $RepeatMaskerPath/Libraries/Dfam.hmm >> ../versionInfo.txt
 _EOF_
     );
   }
   if (length($repeatLib) > 0) {
     $bossScript->add(<<_EOF_
 printf "# RepeatMasker library options: %s\\n" "${repeatLib}" >> ../versionInfo.txt
 echo "# RepeatMasker library options: '$repeatLib'"
 _EOF_
     );
   }
   if ( ! $opt_unmaskedSeq && ! $inHive) {
     $bossScript->add(<<_EOF_
 mkdir -p $clusterSeqDir
 rsync -av $unmaskedSeq $clusterSeq
 _EOF_
     );
   }
   if ($opt_unmaskedSeq) {
     $bossScript->add(<<_EOF_
 rm -rf $partDir
 $Bin/simplePartition.pl $clusterSeq 500000 $partDir
 _EOF_
     );
   } else {
     $bossScript->add(<<_EOF_
 rm -rf $partDir
 $Bin/simplePartition.pl $clusterSeq 500000 $partDir
 rm -f $buildDir/RMPart
 ln -s $partDir $buildDir/RMPart
 _EOF_
     );
   }
   my $binPara = "/parasol/bin/para";
   if ( ! -s "$binPara" ) {
     # allow PATH to find the para command
     $binPara = "para";
   }
   my $gensub2 = &HgAutomate::gensub2();
   $bossScript->add(<<_EOF_
 
 printf "\\nPARAMETERS:\\
 $RepeatMasker $RepeatMaskerEngine -align $repeatLib\\n" >> ../versionInfo.txt
 
 $gensub2 $partDir/partitions.lst single gsub jobList
 $binPara $parasolRAM make jobList
 $binPara check
 $binPara time > run.time
 cat run.time
 
 _EOF_
   );
   if (! $opt_unmaskedSeq && ! $inHive) {
     $bossScript->add(<<_EOF_
 rm -f $clusterSeq
 _EOF_
     );
   }
+  `touch "$runDir/para_hub_$paraHub"`;
   $bossScript->execute();
 } # doCluster
 
 #########################################################################
 # * step: cat [fileServer]
 sub doCat {
   my $runDir = "$buildDir";
   if (! -s "${ncbiRmsk}" ) {
       &HgAutomate::checkExistsUnlessDebug('cluster', 'cat',
 				      "$buildDir/run.cluster/run.time");
   }
   if ( -s "$runDir/doCat.bash" ) {
      if ( ! (-s "$runDir/$db.sorted.fa.out.gz" || -s "$runDir/$db.sorted.fa.out" ) ) {
          die "\nERROR: cat step may not be completed properly\n";
      } else {
          &HgAutomate::verbose(1,
 	  "\ncat step previously completed\n");
          return;
      }
   }
   my $whatItDoes =
 "It concatenates .out files from cluster run into a single $db.sorted.fa.out.\n" .
 "liftUp (with no lift specs) is used to concatenate .out files because it\n" .
 "uniquifies (per input file) the .out IDs which can then be used to join\n" .
 "fragmented repeats in the Nested Repeats track.";
   my $fileServer = &HgAutomate::chooseFileServer($runDir);
   my $bossScript = newBash HgRemoteScript("$runDir/doCat.bash", $fileServer,
 				      $runDir, $whatItDoes);
 
   $bossScript->add("# Debug mode -- don't know the real number of levels\n")
     if ($opt_debug);
 
   if ($ncbiRmsk) {
     my $zippedSource = 0;	# asssume not a gz zipped file
     $zippedSource = 1 if ($ncbiRmsk =~ m/.gz$/);
     my $liftOpts = "";
     if (-s "${liftSpec}" ) {
       $liftOpts = "| liftUp -type=.out stdout $liftSpec warn stdin";
     }
     $bossScript->add(<<_EOF_
 export db="${db}"
 export ncbiRmOutFile="${ncbiRmsk}"
 cat /dev/null > "\${db}.sorted.fa.out"
 _EOF_
     );
     if (! -s "${liftSpec}" ) {
     printf STDERR "# WARNING: no liftSpec given with ncbiRmsk file, probably will need one ?\n";
     $bossScript->add(<<_EOF_
 #########
 # WARNING: no liftSpec given with ncbiRmsk file, probably will need one ?
 #########
 
 printf "   SW  perc perc perc  query      position in query           matching       repeat              position in  repeat
 score  div. del. ins.  sequence    begin     end    (left)    repeat         class/family         begin  end (left)   ID
 
 " >> "\${db}.sorted.fa.out"
 _EOF_
     );
     }
     if ($zippedSource) {
       if ( -s "${dupList}" ) {
         $bossScript->add(<<_EOF_
 zcat "\${ncbiRmOutFile}" | tail -n +4 | sort -k5,5 -k6,6n $liftOpts \\
    | grep -v -f ${dupList} >> "\${db}.sorted.fa.out"
 _EOF_
         );
       } else {
         $bossScript->add(<<_EOF_
 zcat "\${ncbiRmOutFile}" | tail -n +4 | sort -k5,5 -k6,6n $liftOpts >> "\${db}.sorted.fa.out"
 _EOF_
         );
       }
     } else {
       if ( -s "${dupList}" ) {
         $bossScript->add(<<_EOF_
 tail -n +4 "\${ncbiRmOutFile}" sort -k5,5 -k6,6n $liftOpts \\
     | grep -v -f ${dupList} >> "\${db}.sorted.fa.out"
 _EOF_
         );
       } else {
         $bossScript->add(<<_EOF_
 tail -n +4 "\${ncbiRmOutFile}" sort -k5,5 -k6,6n $liftOpts >> "\${db}.sorted.fa.out"
 _EOF_
         );
       }
     }
   } else {	# not using the NCBI supplied file, use local cluster run result
   my $partDir = "$buildDir/RMPart";
   my $levels = $opt_debug ? 1 : `cat $partDir/levels`;
   chomp $levels;
   # Use symbolic link created in cluster step:
   my $indent = "";
   my $path = "";
   # Make the appropriate number of nested foreach's to match the number
   # of levels from the partitioning.  If $levels is 1, no foreach required.
   for (my $l = $levels - 2;  $l >= 0;  $l--) {
     my $dir = ($l == ($levels - 2)) ? $partDir : "\$d" . ($l + 1);
     $bossScript->add(<<_EOF_
 ${indent}for d$l in $dir/???
 do
 _EOF_
     );
     $indent .= '  ';
     $path .= "\$d$l/";
   }
   for (my $l = 0;  $l <= $levels - 2;  $l++) {
     $bossScript->add(<<_EOF_
 ${indent}  liftUp ${path}cat.out /dev/null carry ${path}???/*.out
 ${indent}  liftUp ${path}cat.align /dev/null carry ${path}???/*.align
 done
 _EOF_
     );
     $indent =~ s/  //;
     $path =~ s/\$d\d+\/$//;
   }
   $bossScript->add(<<_EOF_
 
 liftUp $db.fa.out /dev/null carry $partDir/???/*.out
 liftUp $db.fa.align /dev/null carry $partDir/???/*.align
 head -3 $db.fa.out > $db.sorted.fa.out
 tail -n +4 $db.fa.out | sort -k5,5 -k6,6n >> $db.sorted.fa.out
 _EOF_
   );
   }
   $bossScript->add(<<_EOF_
 
 # Use the ID column to join up fragments of interrupted repeats for the
 # Nested Repeats track.  Try to avoid the Undefined id errors.
 ($Bin/extractNestedRepeats.pl $db.sorted.fa.out 2> nestRep.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed
 if [ -s "nestRep.err" ]; then
   export lineCount=`(grep "Undefined id, line" nestRep.err || true) | cut -d' ' -f6 | wc -l`
    if [ "\${lineCount}" -gt 0 ]; then
      if [ "\${lineCount}" -gt 10 ]; then
         printf "ERROR: too many Undefined id lines (> 10) reported by extractNestedRepeats.pl" 1>&2
         exit 255
      fi
      export sedExpr=`grep "Undefined id, line" nestRep.err | cut -d' ' -f6 | sed -e 's/\$/d;/;' | xargs echo`
      export sedCmd="sed -i.broken -e '\$sedExpr'"
      eval \$sedCmd $db.sorted.fa.out
      ($Bin/extractNestedRepeats.pl $db.sorted.fa.out 2> nestRep2.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed
      if [ -s "nestRep2.err" ]; then
         printf "ERROR: the attempt of cleaning nestedRepeats did not work" 1>&2
         exit 255
      fi
    fi
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doCat
 
 #########################################################################
 # * step: mask [workhorse]
 sub doMask {
   my $runDir = "$buildDir";
   &HgAutomate::checkExistsUnlessDebug('cat', 'mask', "$buildDir/$db.sorted.fa.out");
 
   my $whatItDoes = "It makes a masked .2bit in this build directory.";
   my $workhorse = &HgAutomate::chooseWorkhorse();
   my $bossScript = newBash HgRemoteScript("$runDir/doMask.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export db=$db
 twoBitMask $unmaskedSeq \$db.sorted.fa.out \$db.rmsk$updateTable.2bit
 twoBitToFa \$db.rmsk$updateTable.2bit stdout | faSize stdin > faSize.rmsk$updateTable.txt
 _EOF_
   );
   $bossScript->execute();
 } # doMask
 
 #########################################################################
 # * step: install [dbHost, maybe fileServer]
 sub doInstall {
   my $runDir = "$buildDir";
   &HgAutomate::checkExistsUnlessDebug('cat', 'install', "$buildDir/$db.sorted.fa.out");
 
   my $split = "";
   $split = " (split)" if ($opt_splitTables);
   my $whatItDoes =
 "It loads $db.sorted.fa.out into the$split rmsk$updateTable table and $db.nestedRepeats.bed\n" .
 "into the nestedRepeats table.  It also installs the masked 2bit.";
   my $bossScript = newBash HgRemoteScript("$runDir/doLoad.bash", $dbHost,
 				      $runDir, $whatItDoes);
 
   $split = "-nosplit";
   $split = "-split" if ($opt_splitTables);
   my $installDir = "$HgAutomate::clusterData/$db";
   $bossScript->add(<<_EOF_
 export db=$db
 
 # ensure sort functions properly despite kluster node environment
 export LC_COLLATE=C
 
 hgLoadOut -table=rmsk$updateTable $split \$db \$db.sorted.fa.out
 hgLoadOut -verbose=2 -tabFile=\$db.rmsk$updateTable.tab -table=rmsk$updateTable -nosplit \$db \$db.sorted.fa.out 2> \$db.bad.records.txt
 # construct bbi files for assembly hub
 $RepeatMaskerPath/util/rmToTrackHub.pl -out \$db.sorted.fa.out -align \$db.fa.align
 # in place same file sort using the -o output option
 sort -k1,1 -k2,2n -o \$db.fa.align.tsv \$db.fa.align.tsv &
 sort -k1,1 -k2,2n -o \$db.sorted.fa.join.tsv \$db.sorted.fa.join.tsv
 wait
 bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigRmskAlignBed.as -type=bed3+14 \\
   \$db.fa.align.tsv ../../chrom.sizes \$db.rmsk.align.bb &
 bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigRmskBed.as -type=bed9+5 \\
   \$db.sorted.fa.join.tsv ../../chrom.sizes \$db.rmsk.bb
 wait
 rm -fr classBed classBbi rmskClass
 mkdir classBed classBbi rmskClass
 sort -k12,12 \$db.rmsk$updateTable.tab \\
   | splitFileByColumn -ending=tab  -col=12 -tab stdin rmskClass
 for T in SINE LINE LTR DNA Simple Low_complexity Satellite
 do
     fileCount=`(ls rmskClass/\${T}*.tab 2> /dev/null || true) | wc -l`
     if [ "\$fileCount" -gt 0 ]; then
        echo "working: "`ls rmskClass/\${T}*.tab | xargs echo`
        \$HOME/kent/src/hg/utils/automation/rmskBed6+10.pl rmskClass/\${T}*.tab \\
         | sort -k1,1 -k2,2n > classBed/\$db.rmsk.\${T}.bed
        bedToBigBed -tab -type=bed6+10 -as=\$HOME/kent/src/hg/lib/rmskBed6+10.as \\
          classBed/\$db.rmsk.\${T}.bed ../../chrom.sizes \\
            classBbi/\$db.rmsk.\${T}.bb
     fi
 done
 fileCount=`(ls rmskClass/*RNA.tab 2> /dev/null || true) | wc -l`
 if [ "\$fileCount" -gt 0 ]; then
   echo "working: "`ls rmskClass/*RNA.tab | xargs echo`
   \$HOME/kent/src/hg/utils/automation/rmskBed6+10.pl rmskClass/*RNA.tab \\
      | sort -k1,1 -k2,2n > classBed/\$db.rmsk.RNA.bed
   bedToBigBed -tab -type=bed6+10 -as=\$HOME/kent/src/hg/lib/rmskBed6+10.as \\
      classBed/\$db.rmsk.RNA.bed ../../chrom.sizes \\
         classBbi/\$db.rmsk.RNA.bb
 fi
 fileCount=`(ls rmskClass/*.tab | egrep -v "/SIN|/LIN|/LT|/DN|/Simple|/Low_complexity|/Satellit|RNA.tab" 2> /dev/null || true) | wc -l`
 if [ "\$fileCount" -gt 0 ]; then
   echo "working: "`ls rmskClass/*.tab | egrep -v "/SIN|/LIN|/LT|/DN|/Simple|/Low_complexity|/Satellit|RNA.tab" | xargs echo`
   \$HOME/kent/src/hg/utils/automation/rmskBed6+10.pl `ls rmskClass/*.tab | egrep -v "/SIN|/LIN|/LT|/DN|/Simple|/Low_complexity|/Satellit|RNA.tab"` \\
         | sort -k1,1 -k2,2n > classBed/\$db.rmsk.Other.bed
   bedToBigBed -tab -type=bed6+10 -as=\$HOME/kent/src/hg/lib/rmskBed6+10.as \\
     classBed/\$db.rmsk.Other.bed ../../chrom.sizes \\
       classBbi/\$db.rmsk.Other.bb
 fi
 
 export bbiCount=`for F in classBbi/*.bb; do bigBedInfo \$F | grep itemCount; done | awk '{print \$NF}' | sed -e 's/,//g' | ave stdin | grep total | awk '{print \$2}' | sed -e 's/.000000//'`
 
 export firstTabCount=`cat \$db.rmsk$updateTable.tab | wc -l`
 export splitTabCount=`cat rmskClass/*.tab | wc -l`
 
 if [ "\$firstTabCount" -ne \$bbiCount ]; then
    echo "\$db.rmsk$updateTable.tab count: \$firstTabCount, split class tab file count: \$splitTabCount, bbi class item count:  \$bbiCount"
    echo "ERROR: did not account for all items in rmsk class bbi construction" 1>&2
    exit 255
 fi
 wc -l classBed/*.bed > \$db.class.profile.txt
 wc -l rmskClass/*.tab >> \$db.class.profile.txt
 rm -fr rmskClass classBed \$db.rmsk$updateTable.tab
 _EOF_
   );
 
   if ($opt_updateTable) {
   $bossScript->add(<<_EOF_
 if [ -s \$db.nestedRepeats.bed ]; then
   sed -e 's/nestedRepeats/nestedRepeatsUpdate/g' \$HOME/kent/src/hg/lib/nestedRepeats.sql > nestedRepeatsUpdate.sql
   hgLoadBed \$db nestedRepeats$updateTable \$db.nestedRepeats.bed \\
     -sqlTable=nestedRepeatsUpdate.sql
 fi
 _EOF_
   );
   } else {
   $bossScript->add(<<_EOF_
 if [ -s \$db.nestedRepeats.bed ]; then
   hgLoadBed \$db nestedRepeats \$db.nestedRepeats.bed \\
     -sqlTable=\$HOME/kent/src/hg/lib/nestedRepeats.sql
 fi
 _EOF_
   );
   }
 
   $bossScript->add(<<_EOF_
 rm -f $installDir/\$db.rmsk$updateTable.2bit
 ln -s $buildDir/\$db.rmsk$updateTable.2bit $installDir/\$db.rmsk$updateTable.2bit
 _EOF_
   );
   $bossScript->execute();
 
   # Make a new script for the fileServer if chrom-based:
   if ($chromBased) {
     my $fileServer = &HgAutomate::chooseFileServer($runDir);
     $whatItDoes =
 "It splits $db.sorted.fa.out into per-chromosome files in chromosome directories\n" .
 "where makeDownload.pl will expect to find them.\n";
     my $bossScript = newBash HgRemoteScript("$runDir/doSplit.bash", $fileServer,
 					$runDir, $whatItDoes);
     $bossScript->add(<<_EOF_
 export db=$db
 head -3 \$db.sorted.fa.out > /tmp/rmskHead.txt
 tail -n +4 \$db.sorted.fa.out \\
 | splitFileByColumn -col=5 stdin /cluster/data/\$db -chromDirs \\
     -ending=.fa.out -head=/tmp/rmskHead.txt
 _EOF_
     );
     $bossScript->execute();
   }
 } # doInstall
 
 
 #########################################################################
 # * step: cleanup [fileServer]
 sub doCleanup {
   my $runDir = "$buildDir";
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $fileServer = &HgAutomate::chooseFileServer($runDir);
   my $bossScript = newBash HgRemoteScript("$runDir/doCleanup.bash", $fileServer,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 if [ -e "$db.fa.align" ]; then
   gzip $db.fa.align
 fi
 rm -fr RMPart/*
 rm -fr RMPart
 if [ -d "/hive/data/genomes/$db/RMPart" ]; then
    rmdir /hive/data/genomes/$db/RMPart
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 ($db) = @ARGV;
 
 # Now that we know the $db, figure out our paths:
 my $date = `date +%Y-%m-%d`;
 chomp $date;
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/RepeatMasker.$date";
 $unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq :
   "$HgAutomate::clusterData/$db/$db.unmasked.2bit";
 my $seqCount = `twoBitInfo $unmaskedSeq stdout | wc -l`;
 $chromBased = ($seqCount <= $HgAutomate::splitThreshold) && $opt_splitTables;
 $updateTable = $opt_updateTable ? "Update" : "";
 $ncbiRmsk = $opt_ncbiRmsk ? $opt_ncbiRmsk : "";
 $dupList = $opt_dupList ? $opt_dupList : "";
 $liftSpec = $opt_liftSpec ? $opt_liftSpec : "";
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;
 
 &HgAutomate::verbose(1, <<_EOF_
 
  *** All done!$upThrough - Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s
  *** Steps were performed in $buildDir
 _EOF_
 );
 if ($stepper->stepPrecedes('mask', $stopStep)) {
   &HgAutomate::verbose(1, <<_EOF_
  *** Please check the log file to see if hgLoadOut had warnings that we
      should pass on to Arian and Robert.
  *** Please also spot-check some repeats in the browser, run twoBitToFa on
      a portion of $db.2bit to make sure there's some masking, etc.
 
 _EOF_
   );
 }
 &HgAutomate::verbose(1, "\n");