06d7be056190c14b85e71bc12523f18ea6815b5e
markd
  Mon Dec 7 00:50:29 2020 -0800
BLAT mmap index support merge with master

diff --git src/hg/utils/automation/doIdKeys.pl src/hg/utils/automation/doIdKeys.pl
index b65c416..ec4770f 100755
--- src/hg/utils/automation/doIdKeys.pl
+++ src/hg/utils/automation/doIdKeys.pl
@@ -1,324 +1,325 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/doIdKeys.pl instead.
 
 # calculates md5sum strings for each sequence in a 2bit file
 # and constructs a single md5sum from all those individual md5sums for a
 # single 'keySignature' identifier for the entire sequence.
 
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_twoBit
     /;
 
 # Steps that -continue / -stop can name, in the order they are executed.
 # Each entry pairs the step name with the sub that implements it.
 my $stepper = HgStepManager->new(
     [
         { name => 'setup',       func => \&doSetup },
         { name => 'clusterRun',  func => \&doClusterRun },
         { name => 'finalResult', func => \&doFinalResult },
         { name => 'cleanup',     func => \&doCleanup },
     ]
 );
 
 # Option defaults ($workhorse, $bigClusterHub and $dbHost may be replaced in
 # checkOptions; $twoBit is replaced in main once the db name is known):
 my $defaultWorkhorse = 'hgwdev';
 my $workhorse = 'hgwdev';
 my $dbHost = 'hgwdev';
 my $bigClusterHub = 'ku';
 # The '$db' parts are literal placeholders here, as shown in the usage text.
 my $twoBit = $HgAutomate::clusterData . '/$db/$db.2bit';
 
 # Script name with any leading directory path removed.
 (my $base = $0) =~ s{^(.*/)?}{};
 
 # usage($status, $detailed)
 # Print the usage / help text to STDERR and exit with $status.
 #   $status   - exit code handed to exit()
 #   $detailed - when true, the 'Assumptions' section is printed too (-help)
 # Does not return.
 sub usage {
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base db
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/idKeys
     -twoBit seq.2bit      Use seq.2bit as the input sequence instead
                           of default ($twoBit).
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
                                                 'workhorse' => $defaultWorkhorse,
                                                 'bigClusterHub' => $bigClusterHub);
   # The sequence-count threshold quoted below must match the one tested in
   # the bash script generated by doSetup.
   print STDERR "
 Automates the construction of an 'idKeys' file for a 2bit sequence
     The 'idKeys' are the md5sum results of each sequence in the 2bit file.
     Steps:
     setup: establish work directories and scripts for processing
     clusterRun: perform the cluster run
                 cluster run is performed only when number of
                 sequences is >= 10,000,000,000, else twoBitDup is used once
     finalResult: gather the results of the clusterRun (or twoBitDup) into
                 a single results file '<db>.idKeys.txt': two columns:
                 1. md5sum string (sorted on this column)
                 2. sequence name (chromosome name, scaffold name, contig...)
     cleanup: Removes or compresses intermediate files.
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/idKeys unless -buildDir is given.";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. 2bit file is a valid sequence file
 " if ($detailed);
   # Keep the trailing newline on STDERR with the rest of the help text.
   print STDERR "\n";
   exit $status;
 }
 
 # Globals:
 # Command line args: db
 # ($db is assigned from @ARGV in the main section, after option parsing.)
 my ($db);
 # Other:
 # $buildDir is the working directory used as $runDir by every step;
 # $secondsStart / $secondsEnd bracket the run for the elapsed-time report.
 my ($buildDir, $secondsStart, $secondsEnd);
 
 # Parse the command line with GetOptions and apply host overrides.
 # Calls usage(1) when parsing or step-option processing fails, and
 # usage(0, 1) when -help was given; usage() exits, so neither returns here.
 sub checkOptions {
   my @optionSpec = (@HgStepManager::optionSpec,
                     'buildDir=s',
                     'twoBit=s',
                     @HgAutomate::commonOptionSpec);
   my $parsedOk = GetOptions(@optionSpec);
   if (!$parsedOk) {
     &usage(1);
   }
   if ($opt_help) {
     &usage(0, 1);
   }
   &HgAutomate::processCommonOptions();
   my $stepErr = $stepper->processOptions();
   if ($stepErr) {
     usage(1);
   }
   # Command-line values replace the built-in defaults only when given.
   if ($opt_workhorse) {
     $workhorse = $opt_workhorse;
   }
   if ($opt_bigClusterHub) {
     $bigClusterHub = $opt_bigClusterHub;
   }
   if ($opt_dbHost) {
     $dbHost = $opt_dbHost;
   }
 }
 
 #########################################################################
 # * step: setup [workhorse]
 # doSetup: create the build directory and run a generated doSetup.bash there.
 # The bash script writes part.list (first column of twoBitInfo output, ordered
 # by the numeric second column, descending) and then either:
 #   - runs twoBitDup -keyList once to write <db>.idKeys.txt, when the number
 #     of lines in part.list is below the threshold tested in the script, or
 #   - splits part.list into 5000-line files under splitList/, writes a jobList
 #     with one 'runOne' line per sequence, and writes the runOne script, which
 #     md5sums one sequence (twoBitToFa -noMask output, header line dropped,
 #     lower-cased, newlines removed) into the given result file.
 # Dies if a non-empty doSetup.bash already exists, unless $opt_debug is set.
 sub doSetup {
   my $runDir = "$buildDir";
 
   # First, make sure we're starting clean.
   if ( ! $opt_debug && ( -s "$runDir/doSetup.bash" ) ) {
     die "doSetup: looks like this was run successfully already " .
       "(directory db/bed/idKeys exists).  Either run with -continue clusterRun or some later " .
         "stage, or move aside/remove $runDir and run again.\n";
   }
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "Establish working directory and scripts to run the job.";
   # Arguments: script path, host ($workhorse), run directory, description.
   my $bossScript = newBash HgRemoteScript("$runDir/doSetup.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   # NOTE: everything between the heredoc markers below is bash, not perl;
   # backslash-escaped dollar signs are left for bash to expand at run time,
   # while unescaped perl variables ($twoBit, $db) are filled in here.
+  # improved twoBitDup 2020-12-04 can now do billions in one go
   $bossScript->add(<<_EOF_
 twoBitInfo $twoBit stdout | sort -k2nr | cut -f1 > part.list
 export partCount=`cat part.list | wc -l`
-if [ "\${partCount}" -lt 5000 ]; then
+if [ "\${partCount}" -lt 10000000000 ]; then
   time ( twoBitDup -keyList=stdout $twoBit | grep -v "are identical" | sort > $db.idKeys.txt) > twoBitDup.log 2>&1
 else
   mkdir -p splitList
   split -a 3 -d -l 5000 part.list splitList/part
   for F in splitList/part*
   do
     export B=`basename \$F`
     cat \$F | while read P
     do
       printf "runOne %s {check out exists+ result/%s/%s.txt}\n" \\
              "\${P}" "\${B}" "\${P}"
     done
   done > jobList
 
   printf '#!/bin/bash
 set -beEu -o pipefail
 
 export contig=\$1
 export result=\$2
 mkdir -p `dirname \$result`
 sleep 1
 touch `dirname \$result`
 sleep 1
 
 printf "%%s\\\\t%%s\\\\n" `twoBitToFa -noMask \\
   $twoBit:\${contig} stdout \\
    | grep -v "^>" | tr "[A-Z]" "[a-z]" | tr --delete "\\\\n" | md5sum \\
       | awk '"'"'{print \$1}'"'"'` "\${contig}" > "\${result}"
 ' > runOne
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doSetup
 
 #########################################################################
 # * step: clusterRun [bigClusterHub]
 # doClusterRun: run the generated clusterRun.bash on the cluster hub.
 # When the setup step left a non-empty 'runOne' script, the jobs are run with
 # the command text from HgAutomate::paraRun(); otherwise the script checks
 # that <db>.idKeys.txt exists and copies twoBitDup.log to run.time.
 # Dies (unless $opt_debug is set) when part.list is missing/empty or when a
 # non-empty run.time shows the step has already been run.
 sub doClusterRun {
   my $runDir = "$buildDir";
   my $paraHub = $bigClusterHub;
 
   unless ($opt_debug) {
     # First, make sure previous step has completed:
     unless (-s "$runDir/part.list") {
       die "doClusterRun: previous 'setup' step has not completed, no part.list file present.\n";
     }
     # Then, make sure we're starting clean.
     if (-s "$runDir/run.time") {
       die "doClusterRun: looks like this was run successfully already "
         . "(file <db>/bed/idKeys/run.time exists).  Either run with -continue finalResult or some later "
         . "stage, or move aside/remove $runDir and run again.\n";
     }
   }
 
   my $whatItDoes = "Perform cluster run if necessary.";
   my $bossScript = HgRemoteScript->newBash("$runDir/clusterRun.bash",
                                            $paraHub, $runDir, $whatItDoes);
 
   my $paraRun = &HgAutomate::paraRun();
   # Bash text; $paraRun and $db are filled in by perl before it is added.
   my $bashBody = <<_EOF_;
 if [ -s runOne ]; then
   chmod +x runOne
   $paraRun
 else
   if [ ! -s $db.idKeys.txt ]; then
      printf "ERROR: previous step doIdKeys failed twoBitDup procedure" 1>&2
      exit 255
   else
      cp -p twoBitDup.log run.time
   fi
 fi
 _EOF_
   $bossScript->add($bashBody);
   $bossScript->execute();
 } # doClusterRun
 
 #########################################################################
 # * step: finalResult [workhorse]
 # doFinalResult: run the generated doFinalResult.bash in the build directory.
 # If a result/ directory exists, its part* contents are concatenated and
 # sorted into <db>.idKeys.txt.  From that file the script then writes
 # <db>.keySignature.txt (md5sum of column 1) and <db>.hasDups.txt (column 1
 # values seen more than once; removed again when empty).
 # Dies (unless $opt_debug is set) when run.time is missing/empty or when a
 # non-empty doFinalResult.bash shows the step has already been run.
 sub doFinalResult {
   my $runDir = "$buildDir";
 
   unless ($opt_debug) {
     # First, make sure previous step has completed:
     unless (-s "$runDir/run.time") {
       die "doFinalResult: previous 'clusterRun' step has not completed, no run.time file present.\n";
     }
     # Then, make sure we're starting clean.
     if (-s "$runDir/doFinalResult.bash") {
       die "doFinalResult: looks like this was run successfully already "
         . "(file db/bed/doFinalResult.bash exists).  Either run with -continue cleanup "
         . ", or move aside/remove $runDir and run again.\n";
     }
   }
 
   my $whatItDoes = "Collect cluster run results into one single result file, and construct the 'keySignature' for the entire sequence.";
   my $bossScript = HgRemoteScript->newBash("$runDir/doFinalResult.bash",
                                            $workhorse, $runDir, $whatItDoes);
 
   # Bash text; $db is filled in by perl, escaped dollars are left for awk.
   my $bashBody = <<_EOF_;
 if [ -d result ]; then
   catDir result/part* | sort -k1,1 > $db.idKeys.txt
 fi
 if [ -s $db.idKeys.txt ]; then
   printf "# finalStep: $db.idKeys.txt file is present and done\\n" 1>&2
   cut -f 1 $db.idKeys.txt | md5sum | awk '{print \$1}' > $db.keySignature.txt
   cut -f1 $db.idKeys.txt | sort | uniq -c | awk '\$1 > 1' > $db.hasDups.txt
   if [ ! -s $db.hasDups.txt ]; then
     rm -f $db.hasDups.txt
   fi
 else
   printf "ERROR: finalstep: $db.idKeys.txt file is missing\\n" 1>&2
   exit 255
 fi
 _EOF_
   $bossScript->add($bashBody);
   $bossScript->execute();
 } # doFinalResult
 
 #########################################################################
 # * step: cleanup [workhorse]
 # doCleanup: run the generated doCleanup.bash in the build directory.
 # The script removes the cluster-run working files and directories, gzips
 # part.list, and gzips jobList when it is non-empty.
 # Dies (unless $opt_debug is set) when a non-empty doCleanup.bash already
 # exists or when <db>.idKeys.txt is missing/empty.
 sub doCleanup {
   my $runDir = "$buildDir";
 
   unless ($opt_debug) {
     # Make sure we're starting clean.
     if (-s "$runDir/doCleanup.bash") {
       die "doCleanup: looks like this was run successfully already "
         . "(file db/bed/doCleanup.bash exists).\n";
     }
     # Verify previous step has completed
     unless (-s "$runDir/$db.idKeys.txt") {
       die "doCleanup: ERROR: previous steps have not completed, there is no "
         . "file $runDir/$db.idKeys.txt present.\n";
     }
   }
 
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $bossScript = HgRemoteScript->newBash("$runDir/doCleanup.bash",
                                            $workhorse, $runDir, $whatItDoes);
   my $bashBody = <<_EOF_;
 rm -fr err splitList result para.results batch para.bookmark batch.bak
 gzip part.list
 if [ -s jobList ]; then
   gzip jobList
 fi
 _EOF_
   $bossScript->add($bashBody);
   $bossScript->execute();
 } # doCleanup
 
 
 #########################################################################
 # main
 
 # Main flow: parse options, resolve the build directory and 2bit path for the
 # single db argument, run the requested steps, then report elapsed time.
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 # Epoch seconds from the built-in time(); no need to spawn `date`.
 $secondsStart = time();
 ($db) = @ARGV;
 
 # Force debug and verbose until this is looking pretty solid:
 # $opt_debug = 1;
 $opt_verbose = 3 if ($opt_verbose < 3);
 
 # Establish what directory we will work in.
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/idKeys";
 $twoBit = $opt_twoBit ? $opt_twoBit :
   "$HgAutomate::clusterData/$db/$db.2bit";
 
 # The 2bit file must exist and be non-empty before any step is attempted.
 # The trailing newline keeps perl from appending "at ... line N" to the
 # message, matching the other die messages in this script.
 if ( ! -s "$twoBit" ) {
   die "can not find 2bit file:\n\t$twoBit\n";
 }
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 # Split the elapsed wall-clock time into whole minutes and leftover seconds.
 $secondsEnd = time();
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;
 
 &HgAutomate::verbose(1,
 	"\n *** All done !$upThrough  Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n");
 &HgAutomate::verbose(1,
 	" *** Steps were performed in $buildDir\n");
 &HgAutomate::verbose(1, "\n");