src/hg/utils/automation/HgAutomate.pm 75892bebe2505e10302cb56e9e1050bd811351d9

75892bebe2505e10302cb56e9e1050bd811351d9
hiram
  Tue Sep 27 09:03:45 2022 -0700
accommodate promoted hubs with db but no chromInfo refs #29819

diff --git src/hg/utils/automation/HgAutomate.pm src/hg/utils/automation/HgAutomate.pm
index 7534b16..461722b 100755
--- src/hg/utils/automation/HgAutomate.pm
+++ src/hg/utils/automation/HgAutomate.pm
@@ -1,883 +1,897 @@
 #
 # HgAutomate: common cluster, postprocessing and database loading operations.
 #
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/HgAutomate.pm instead.
 
 # $Id: HgAutomate.pm,v 1.29 2010/05/20 23:13:33 hiram Exp $
 package HgAutomate;
 
 use warnings;
 use strict;
 use Carp;
 use vars qw(@ISA @EXPORT_OK);
 use Exporter;
 use File::Basename;
 use File::Spec;
 
 @ISA = qw(Exporter);
 
 # This is a listing of the public methods and variables (which should be
 # treated as constants) exported by this module:
 @EXPORT_OK = (
     # Support for common command line options:
     qw( getCommonOptionHelp processCommonOptions
 	@commonOptionVars @commonOptionSpec
       ),
     # Some basic smarts about our compute infrastructure:
     qw( choosePermanentStorage
 	chooseWorkhorse chooseFileServer
 	chooseClusterByBandwidth chooseSmallClusterByBandwidth
 	chooseFilesystemsForCluster checkClusterPath
       ),
     # General-purpose utility routines:
     qw( checkCleanSlate checkExistsUnlessDebug closeStdin
 	getAssemblyInfo getSpecies hubDateName gensub2 machineHasFile
-	databaseExists makeGsub mustMkdir asmHubBuildDir asmHubDownloadDir
-	mustOpen nfsNoodge paraRun run verbose
+	databaseExists dbTableExists makeGsub mustMkdir asmHubBuildDir
+	asmHubDownloadDir mustOpen nfsNoodge paraRun run verbose
       ),
     # Hardcoded paths/commands/constants:
     qw( $centralDbSql $git
 	$clusterData $trackBuild $goldenPath $images $gbdb
 	$splitThreshold $runSSH $setMachtype
       ),
 );
 
 #########################################################################
 # A simple model of our local compute environment with some subroutines
 # for checking the validity of path+machine combos and for suggesting
 # appropriate storage and machines.
 
 use vars qw( %cluster %clusterFilesystem $defaultDbHost );
 
 
 sub readMainCluster(); # forward declaration to keep code order
 
 # the name of the cluster is in a separate text file, so it's easier to
 # use from bash scripts
 
 %cluster =
     ( readMainCluster() =>
         { 'enabled' => 1, 'gigaHz' => 1.4, 'ram' => 8,
 	  'hostCount' => 992, },
       'hgwdev-101' =>
         { 'enabled' => 1, 'gigaHz' => 2.1, 'ram' => 1,
 	  'hostCount' => 32, },
       'hgwdev' =>
         { 'enabled' => 1, 'gigaHz' => 2.1, 'ram' => 1,
 	  'hostCount' => 64, },
     );
 
 # Records for clusters named "obsolete" by this hash; same layout as %cluster
 # (enabled flag, clock speed in GHz, RAM figure, host count).
 # NOTE(review): no code in the visible part of this module reads this hash.
 my %obsoleteCluster =
     ( 'swarm' =>
         { 'enabled' => 1, 'gigaHz' => 2.33, 'ram' => 8,
 	  'hostCount' => 1024, },
       'memk' =>
         { 'enabled' => 1, 'gigaHz' => 1.0, 'ram' => 32,
 	  'hostCount' => 32, },
       'encodek' =>
         { 'enabled' => 1, 'gigaHz' => 2.0, 'ram' => 16,
 	  'hostCount' => 48, },
     );
 
 my @allClusters = (keys %cluster);
 
 %clusterFilesystem =
     ( 'scratch' =>
         { root => '/scratch/data', clusterLocality => 1.0,
 	  distrHost => [], distrCommand => '',
 	  inputFor => \@allClusters, outputFor => [], },
       'hive' =>
         { root => '/hive/data/genomes', clusterLocality => 0.3,
 	  distrHost => ['ku'], distrCommand => '',
 	  inputFor => ['ku'],
 	  outputFor => ['ku'], },
     );
 
 my %obsoleteClusterFilesystem =
     ( 'scratch' =>
         { root => '/scratch/data', clusterLocality => 1.0,
 	  distrHost => [], distrCommand => '',
 	  inputFor => \@allClusters, outputFor => [], },
       'hive' =>
         { root => '/hive/data/genomes', clusterLocality => 0.3,
 	  distrHost => ['ku'], distrCommand => '',
 	  inputFor => ['memk', 'encodek', 'swarm'],
 	  outputFor => ['memk', 'encodek', 'swarm'], },
     );
 
 $defaultDbHost = 'hgwdev';
 
 sub readMainCluster() {
     # Return the first line (trailing newline removed) of the file
     # cluster.txt that sits in the same directory as this .pm file.
     # Keeping the name in a plain text file makes it easy to parse from
     # bash scripts and other languages.
     # Dies if the file cannot be opened or contains no line at all.
     my ($volume, $directory, $file) = File::Spec->splitpath(__FILE__);
     # catpath keeps the volume component, which plain concatenation of
     # $directory would drop on systems that have one.
     my $mainClusterFname =
 	File::Spec->catpath($volume, $directory, 'cluster.txt');
     open (my $clusterFile, '<', $mainClusterFname)
 	|| die "Couldn't open \"$mainClusterFname\": $!\n";
     my $mainCluster = <$clusterFile>;
     close $clusterFile;
     # An empty file would otherwise hand an undef cluster name to %cluster.
     die "No cluster name found in \"$mainClusterFname\"\n"
 	if (! defined $mainCluster);
     chomp $mainCluster;
     return $mainCluster;
 }
 
 sub choosePermanentStorage {
   # Return whichever of /cluster/store1 .. /cluster/store19 reports the
   # largest figure in the df column just before the use% column, or
   # /hive/data/genomes when df gives a usable answer for none of them.
   # Takes no arguments; confesses if any are passed.
   #*** would be good to parameterize instead of hardcoding this!
   confess "Too many arguments" if (scalar(@_) != 0);
   my $maxAvail;
   my $bestRaid;
   foreach my $i (1 .. 19) {
     my $raid = "/cluster/store$i";
     my $df = `df $raid/ 2>&1 | grep -v "No such" | egrep -v '^[A-Za-z]'`;
     # Capture only the number; the rest of the df output (other lines,
     # trailing newline) must not take part in the numeric comparison.
     if ($df =~ m/.*\s+(\d+)\s+\d+\%/) {
       my $avail = $1;
       if (! defined $maxAvail || $avail > $maxAvail) {
 	$maxAvail = $avail;
 	$bestRaid = $raid;
       }
     }
   }
   # Fall back to the hive genomes area when no /cluster/store answered.
   $bestRaid = "/hive/data/genomes" if (! defined $bestRaid);
   return $bestRaid;
 }
 
 sub getMountPoint {
   # Run df on $path and return the last field of its output, provided it
   # follows a "<digits> <digits>%" pair and consists only of slashes and
   # word characters; return undef when the output has no such field.
   # df itself may hang when the filesystem is unhealthy; nothing here
   # guards against that.
   my ($path) = @_;
   my $dfOutput = `df $path`;
   return ($dfOutput =~ m@\d+\s+\d+\%\s+([/\w]+)$@) ? $1 : undef;
 }
 
 sub getClusterFsInfo {
   # Return the %clusterFilesystem record that applies to $path, or undef
   # if there is none.  $path must be absolute (confess otherwise).
   # Paths under /scratch/ or /iscratch/ may not be the same on localhost
   # as on the cluster nodes, so they are looked up by name; every other
   # path is located with df and matched against each record's root.
   #*** would be good to parameterize instead of hardcoding this!
   my ($path) = @_;
   confess "must have complete, not relative, path" if ($path !~ m@^/@);
   if ($path =~ m@^/(scratch|iscratch)/@) {
     return $clusterFilesystem{$1};
   }
   my $mountPoint = &getMountPoint($path);
   # When df gave no mount point there is nothing to compare against; an
   # undefined value in the pattern below would match every root.
   return undef if (! defined $mountPoint);
   foreach my $fs (keys %clusterFilesystem) {
     my $info = $clusterFilesystem{$fs};
     # \Q..\E: the mount point is a literal prefix, not a regex.
     return $info if ($info->{'root'} =~ /^\Q$mountPoint\E/);
   }
   return undef;
 }
 
 sub getOkClusters {
   # List the clusters recorded as acceptable for $path: the 'inputFor'
   # list of its filesystem record when $isInput is true, the 'outputFor'
   # list otherwise.  Empty when the path has no filesystem record.
   my ($path, $isInput) = @_;
   my $fsInfo = &getClusterFsInfo($path);
   my $direction = $isInput ? 'inputFor' : 'outputFor';
   my @okClusters = $fsInfo ? @{$fsInfo->{$direction}} : ();
   return @okClusters;
 }
 
 sub getWarnClusters {
   # Return the clusters for which use of $path deserves a warning rather
   # than a fatal error.  Only paths that are not on a clusterFilesystem
   # qualify: as input the list is every known cluster, as output it is
   # just 'encodek'.  Paths on a known filesystem yield an empty list.
   # Would be nice to use cluster parameters here instead of hardcoding.
   my ($path, $isInput) = @_;
   my $fsInfo = &getClusterFsInfo($path);
   # Explicit empty list; falling off the end of the sub would hand the
   # caller a one-element list holding an empty string.
   return () if ($fsInfo);
   return $isInput ? @allClusters : ('encodek');
 }
 
 sub checkClusterPath {
   # Make sure that the list of paths is OK for the given cluster and in/out.
   # Arguments: cluster name (must be a key of %cluster), direction ("in"
   # or "out"), then one or more paths.
   # For each path: warn if the cluster is in its warn list, otherwise die
   # unless the cluster is in its OK list.  Confesses on bad arguments.
   my ($cluster, $inOrOut, @pathList) = @_;
   confess "Must have at least 3 arguments" if (scalar(@_) < 3);
   my $clusterInfo = $cluster{$cluster};
   if (! defined $clusterInfo) {
     confess "Unrecognized cluster \"$cluster\"";
   }
   if ($inOrOut ne "in" and $inOrOut ne "out") {
     confess "\$inOrOut must be either \"in\" or \"out\"";
   }
   # Neither value depends on the path, so compute them once.
   my $isInput = ($inOrOut eq 'in');
   my $do = $isInput ? 'take input from' : 'send output to';
   foreach my $p (@pathList) {
     my @okClusters = &getOkClusters($p, $isInput);
     my @warnClusters = &getWarnClusters($p, $isInput);
     # Compare names with eq: a cluster name is a literal string, and
     # interpolating it into a pattern would let '.' etc. act as regex.
     if (scalar(grep { $_ eq $cluster } @warnClusters)) {
       warn "Warning: Cluster $cluster probably should not $do $p .\n";
     } elsif (! scalar(grep { $_ eq $cluster } @okClusters)) {
       die "Error: Cluster $cluster cannot $do $p .\n";
     }
   }
 }
 
 sub getLoadFactor {
   # Return the load factor (most-recent) for the given machine: the first
   # figure that "uptime", run on $mach over ssh, prints after
   # "load average:".  If the output has no recognizable figure, return a
   # very high load (1000) so the machine is never preferred.
   my ($mach) = @_;
   confess "Must have exactly 1 argument" if (scalar(@_) != 1);
   my $cmd = "$HgAutomate::runSSH $mach uptime 2>&1 | grep load";
   verbose(4, "about to run '$cmd'\n");
   my $load = `$cmd`;
   # Return only the captured number, not the rest of the command output
   # (trailing newline, any additional lines).
   if ($load =~ m/load average: (\d+\.\d+)/) {
     return $1;
   }
   return 1000;
 }
 
 sub getWorkhorseLoads {
   # Build and return a hash mapping each candidate workhorse machine to
   # its current load factor.  The candidates are hardcoded ('ku' and
   # 'hgwdev'); a valid workhorse needs to have access to hive.
   #*** Would be nice to parameterize instead of hardcoding hostnames...
   confess "Too many arguments" if (scalar(@_) != 0);
   my %loadOf = ();
   foreach my $entry ('ku', 'hgwdev') {
     # Keep only the leading host name: drop everything from the first
     # dot or blank onward, then any trailing newline.
     (my $host = $entry) =~ s/[\. ].*//;
     chomp $host;
     next if (exists $loadOf{$host});
     $loadOf{$host} = &getLoadFactor($host);
   }
   return %loadOf;
 }
 
 sub chooseWorkhorse {
   # Pick the machine to use for compute-heavy steps.  A -workhorse option
   # ($main::opt_workhorse) wins outright.  Otherwise poll the candidate
   # machines and return a random one among those at or under the lowest
   # load threshold (0.1, 0.5, 1.0, 2.0) that any machine satisfies.  When
   # every machine is above 2.0, sleep 120 seconds and poll again -- this
   # loops forever if no machine ever qualifies, so it is up to the
   # engineer to halt the script in that case.
   confess "Too many arguments" if (shift);
   return $main::opt_workhorse if ($main::opt_workhorse);
   &verbose(2, "chooseWorkhorse: polling load factors of " .
 	   "idle small cluster machines.  This may take a minute...\n");
   my $delay = 120;
   while (1) {
     my %loadOf = &getWorkhorseLoads();
     foreach my $maxLoad (0.1, 0.5, 1.0, 2.0) {
       my @candidates = grep { $loadOf{$_} <= $maxLoad } keys %loadOf;
       next if (scalar(@candidates) == 0);
       my $pick = $candidates[int(rand(scalar(@candidates)))];
       &verbose(2, "chooseWorkhorse: $pick meets load " .
 	       "threshold of $maxLoad.\n");
       return $pick;
     }
     &HgAutomate::verbose(1, "chooseWorkhorse: all machines have high load." .
 			 "  waiting $delay seconds...\n");
     sleep($delay);
   }
 }
 
 sub getFileServer {
   # Use df to determine the fileserver for $path.
   # If df shows a "host:/..." device, return that host.  Otherwise the
   # path is taken to be local and the short name of this machine (output
   # of "uname -n" up to the first dot) is returned.
   # Confesses if neither yields a name, or on a wrong argument count.
   my ($path) = @_;
   confess "Must have exactly 1 argument" if (scalar(@_) != 1);
   my $host = `df $path 2>&1 | grep -v Filesystem`;
   if ($host =~ /(\S+):\/.*/) {
     return $1;
   }
   my $localhost = `uname -n`;	# HOST not always present
   # Strip the newline so callers get a bare host name they can compare
   # with eq; allow '-' so names such as hgwdev-101 are recognized.
   chomp $localhost;
   if ($localhost =~ m/^([\w-]+)(\..*)?$/) {
     return $1;
   }
   confess "Could not extract server from output of \"df $path\":\n$host\n";
 }
 
 sub canLogin {
   # Return true if logins are enabled on the given fileserver: any
   # machine whose name starts with "kkstore", or the machine 'eieio'.
   # Confesses unless called with exactly one argument.
   #*** hardcoded
   # The argument check has to come before the return to ever run.
   confess "Must have exactly 1 argument" if (scalar(@_) != 1);
   my ($mach) = @_;
   return ($mach =~ /^kkstore/ || $mach eq 'eieio');
 }
 
 sub chooseFileServer {
   # Pick the machine for an I/O-intensive task on $path.
   # Order of preference:
   #   1. the -fileServer option ($main::opt_fileServer), if given;
   #   2. the fileserver df reports for $path (with a trailing "-10"
   #      removed), when logins are allowed there and its load is < 2.0;
   #   3. whatever chooseWorkhorse() returns.
   my ($path) = @_;
   confess "Must have exactly 1 argument" if (scalar(@_) != 1);
   return $main::opt_fileServer if ($main::opt_fileServer);
   my $server = &getFileServer($path);
   verbose(4, "Fileserver from df is '$server'\n");
   $server =~ s/-10$//;
   my $usable = ($server && &canLogin($server)
 		&& (&getLoadFactor($server) < 2.0));
   #*** SMALL CLUSTER MACHINES CANNOT WGET OUTSIDE, SO NOT ALWAYS A GOOD CHOICE HERE
   return $usable ? $server : &chooseWorkhorse();
 }
 
 sub chooseClusterByBandwidth {
   # Return the name of the enabled cluster in %cluster with the highest
   # estimated available compute.  The estimate per cluster is
   #   (idle machines + busy machines / (1 + running batches)) * gigaHz
   # with the counts taken from "parasol list" run on the hub over ssh.
   # A true first argument restricts the choice to clusters whose gigaHz
   # is at least 2.0.  Confesses if no cluster qualifies.
   # Note: I/O is not taken into account, so it is best to call this
   # before distributing inputs rather than after (unless they went
   # somewhere that is fast for all clusters, like /scratch).
   my $onlySmallFast = shift;
   confess "Too many arguments" if (shift);
   my ($bestCluster, $maxOomph);
   foreach my $paraHub (keys %cluster) {
     my $clusterInfo = $cluster{$paraHub};
     next if (! $clusterInfo->{'enabled'});
     my $speed = $clusterInfo->{'gigaHz'};
     next if ($onlySmallFast && $speed < 2.0);
     my @machInfo = `$HgAutomate::runSSH $paraHub parasol list machines | grep -v dead`;
     # Every listed machine is either idle (line ends in "idle") or busy.
     my $idleCount = scalar(grep { $_ =~ /idle$/ } @machInfo);
     my $busyCount = scalar(@machInfo) - $idleCount;
     my $batchCount =
       `$HgAutomate::runSSH $paraHub parasol list batches | grep -v ^# | wc -l`;
     # Share of the busy machines a new batch can expect to receive.
     my $expectedPortion = 1 / (1 + $batchCount);
     my $oomph = ($idleCount + ($busyCount * $expectedPortion)) * $speed;
     &verbose(3, "chooseClusterByBandwidth: $paraHub: " .
 	     "((idle=$idleCount + (busy=$busyCount * " .
 	     "portion=$expectedPortion)) * speed=$speed) = $oomph\n");
     next if (defined $maxOomph && ($oomph <= $maxOomph));
     $maxOomph = $oomph;
     $bestCluster = $paraHub;
   }
   confess "Failed to find a live cluster" if (! defined $bestCluster);
   &verbose(2, "chooseClusterByBandwidth: $bestCluster " .
 	   "($maxOomph Gop/s est)\n");
   return $bestCluster;
 }
 
 sub chooseSmallClusterByBandwidth {
   # Same selection as chooseClusterByBandwidth, restricted to the small
   # clusters with fast nodes (its $onlySmallFast flag switched on).
   # The same caveat applies: I/O is not considered, so call this before
   # distributing inputs unless they live somewhere fast for all clusters.
   my $onlySmallFast = 1;
   return chooseClusterByBandwidth($onlySmallFast);
 }
 
 sub chooseFilesystemsForCluster {
   # Return a list of suitable filesystems for given cluster and direction.
   # Arguments: cluster name (a key of %cluster) and "in" or "out".
   # The result holds the root of every %clusterFilesystem entry that lists
   # the cluster for that direction and whose root directory exists here;
   # when nothing qualifies the list is just /hive/data/genomes.
   # Confesses on a bad argument count, unknown cluster or bad direction.
   my ($cluster, $inOrOut) = @_;
   confess "Must have exactly 2 arguments" if (scalar(@_) != 2);
   my $clusterInfo = $cluster{$cluster};
   confess "Unrecognized cluster $cluster" if (! $clusterInfo);
   confess "Second arg must be either \"in\" or \"out\""
     if ($inOrOut ne 'in' && $inOrOut ne 'out');
   my $direction = ($inOrOut eq 'in') ? 'inputFor' : 'outputFor';
   my @filesystems;
   foreach my $fs (keys %clusterFilesystem) {
     my $fsInfo = $clusterFilesystem{$fs};
     # Literal comparison: the cluster name must not be used as a regex.
     next if (! scalar(grep { $_ eq $cluster } @{$fsInfo->{$direction}}));
     push @filesystems, $fsInfo->{'root'} if (-d $fsInfo->{'root'});
   }
   push @filesystems, '/hive/data/genomes' if (scalar(@filesystems)<1);
   return @filesystems;
 }
 
 
 #########################################################################
 # Support for command line options expected to be common to many
 # automation scripts:
 
 use vars qw( @commonOptionVars @commonOptionSpec );
 
 # Common option defaults:
 my $defaultVerbose = 1;
 
 @commonOptionVars = qw(
     $opt_workhorse
     $opt_fileServer
     $opt_dbHost
     $opt_bigClusterHub
     $opt_smallClusterHub
     $opt_priority
     $opt_debug
     $opt_verbose
     $opt_help
     );
 
 @commonOptionSpec = ("workhorse=s",
 		     "fileServer=s",
 		     "dbHost=s",
 		     "bigClusterHub=s",
 		     "smallClusterHub=s",
 		     "priority=n",
 		     "verbose=n",
 		     "debug",
 		     "help",
 		    );
 
 my %optionHelpText = ( 'workhorse' =>
 '    -workhorse machine    Use machine (default: %s) for compute or
                           memory-intensive steps.
 ',
 		       'fileServer' =>
 '    -fileServer mach      Use mach (default: fileServer of the build directory)
                           for I/O-intensive steps.
 ',
 		       'dbHost' =>
 '    -dbHost mach          Use mach (default: %s) as database server.
 ',
 		       'bigClusterHub' =>
 '    -bigClusterHub mach   Use mach (default: %s) as parasol hub
                           for cluster runs with very large job counts.
 ',
 		       'smallClusterHub' =>
 '    -smallClusterHub mach Use mach (default: %s) as parasol hub
                           for cluster runs with smallish job counts.
 ',
 		       'priority' =>
 '    -priority num        Use this priority for parasol jobs.
 ',
 		       'debug' =>
 '    -debug                Don\'t actually run commands, just display them.
 ',
 		       'verbose' =>
 '    -verbose num          Set verbose level to num (default %d).
 ',
 		       'help' =>
 '    -help                 Show detailed help and exit.
 ',
 		     );
 
 my %optionDefaultDefaults = ( 'workhorse' => 'least loaded',
 			      'dbHost' => $defaultDbHost,
 			      'priority' => '10',
 			      'bigClusterHub' => 'most available',
 			      'smallClusterHub' => 'most available',
 			      'verbose' => $defaultVerbose,
 			    );
 
 
 sub getCommonOptionHelp {
   # Assemble the usage text for the common options.
   # Input: a hash of option name => default value to display.  A false
   # default (e.g. '') means "show the entry from %optionDefaultDefaults".
   # Options are listed in sorted order; the debug, verbose and help
   # entries are appended when the caller did not name them.
   # Dies on an option name that has no help text.
   my %optionSpec = @_;
   my @sections = ();
   foreach my $opName (sort keys %optionSpec) {
     if (! exists $optionHelpText{$opName}) {
       die "HgAutomate::getCommonOptionHelp: unrecognized option '$opName'.\n" .
 	"Supported values: " . join(", ", sort keys %optionHelpText) . ".\n";
     }
     my $shownDefault = $optionSpec{$opName} || $optionDefaultDefaults{$opName};
     push @sections, sprintf($optionHelpText{$opName}, $shownDefault);
   }
   if (! exists $optionSpec{'debug'}) {
     push @sections, $optionHelpText{'debug'};
   }
   if (! exists $optionSpec{'verbose'}) {
     push @sections, sprintf($optionHelpText{'verbose'},
 			    $optionDefaultDefaults{'verbose'});
   }
   if (! exists $optionSpec{'help'}) {
     push @sections, $optionHelpText{'help'};
   }
   return join("", @sections);
 }
 
 sub processCommonOptions {
   # Apply defaults for the common command line options.  At present that
   # means only -verbose: $main::opt_verbose gets the module default when
   # the caller left it undefined.  Handling -help is up to the caller.
   if (! defined $main::opt_verbose) {
     $main::opt_verbose = $defaultVerbose;
   }
 }
 
 
 #########################################################################
 #	These items should come from a configuration file so this
 #	business can be easily set up in other environments.
 # Hardcoded paths/command sequences:
 # Package globals for the hardcoded paths/commands set below.  Declared
 # once; a second "use vars" repeating a subset of these names added nothing.
 use vars qw( 	$centralDbSql $git
 		$clusterData $trackBuild $goldenPath $images $gbdb
 		$splitThreshold $runSSH $setMachtype
 	   );
 $centralDbSql = "hgsql -h localhost -A -N hgcentraltest";
 $git = "/usr/bin/git";
 
 $clusterData = '/hive/data/genomes';
 $trackBuild = 'bed';
 my $apacheRoot = '/usr/local/apache';
 $goldenPath = "$apacheRoot/htdocs-hgdownload/goldenPath";
 $images = "$apacheRoot/htdocs/images";
 $gbdb = '/gbdb';
 
 # This is the max number of sequences in an assembly that we will consider
 # "chrom-based" (allow split tables; per-seq files can fit in one directory)
 # as opposed to "scaffold-based" (no split tables; multi-level directory for
 # per-seq files, or use set of multi-seq files).
 $splitThreshold = 100;
 
 # ssh command and its options, the extra -o options prevent asking
 # questions about adding machines to known hosts
 $runSSH = "ssh -x -o 'StrictHostKeyChecking = no' -o 'BatchMode = yes'";
 
 $setMachtype = "setenv MACHTYPE `uname -m | sed -e 's/i686/i386/;'`";
 
 #########################################################################
 # General utility subroutines:
 
 sub checkCleanSlate {
   # Guard against re-running a step: for every given file or directory
   # that already exists, print a warning that names it and suggests
   # -continue $nextStep, then exit 1 once all have been examined.
   # Returns normally when none of them exists.
   # Confesses on fewer than 3 arguments or on any undef argument.
   my ($step, $nextStep, @files) = @_;
   confess "Must have at least 3 arguments" if (scalar(@_) < 3);
   confess "undef input" if (! defined $step || ! defined $nextStep);
   my @alreadyThere = ();
   foreach my $path (@files) {
     confess "undef input" if (! defined $path);
     next if (! -e $path);
     warn "$step: looks like this was run successfully already " .
       "($path exists).  Either run with -continue $nextStep or some later " .
       "step, or move aside/remove $path and run again.\n";
     push @alreadyThere, $path;
   }
   exit 1 if (scalar(@alreadyThere) > 0);
 }
 
 sub checkExistsUnlessDebug {
   # Verify that the outputs of the previous step are present.  For each
   # given file that does not exist, print a warning explaining how to
   # recover, then exit 1 once all have been examined.  Does nothing when
   # $main::opt_debug is set.
   # Confesses on fewer than 3 arguments or on any undef argument.
   my ($prevStep, $step, @files) = @_;
   confess "Must have at least 3 arguments" if (scalar(@_) < 3);
   confess "undef input" if (! defined $prevStep || ! defined $step);
   return if ($main::opt_debug);
   my $missingCount = 0;
   foreach my $path (@files) {
     confess "undef input" if (! defined $path);
     next if (-e $path);
     warn "$step: output of previous step $prevStep, $path , is required " .
       "but does not appear to exist.\n" .
       "If it actually does exist, then this error is probably due to " .
       "network/filesystem delays -- wait a minute and restart with " .
       "-continue $step.\n" .
       "If it really doesn't exist, either fix things manually or " .
       "try -continue $prevStep\n";
     $missingCount++;
   }
   exit 1 if ($missingCount > 0);
 }
 
 sub paraRun {
  # Return the shell command sequence that runs a parasol batch: make the
  # jobList, check it, record timing in run.time and show it.
  # Uses /parasol/bin/para when that file exists, otherwise plain "para"
  # so PATH can find the command.  A -priority option
  # ($main::opt_priority), when defined, is passed on to "para make".
  my $para = (-e '/parasol/bin/para') ? '/parasol/bin/para' : "para";
  my $pargs = (defined $main::opt_priority)
 	? "-priority=$main::opt_priority" : "";
  my @commands = ("$para make $pargs jobList",
 		 "$para check",
 		 "$para time > run.time",
 		 'cat run.time');
  return join("\n", @commands);
 }
 
 sub gensub2 {
  # Return the gensub2 command to use: /parasol/bin/gensub2 when that file
  # exists with non-zero size, otherwise plain "gensub2" so PATH can find
  # the command.
  return (-s '/parasol/bin/gensub2') ? '/parasol/bin/gensub2' : "gensub2";
 }
 
 sub closeStdin {
   # Detach STDIN from the terminal by reopening it on /dev/null.
   # If we don't do this, the script can hang ("Suspended (tty input)")
   # when it is run backgrounded (&) and then something is typed into the
   # terminal... or something like that.  Anyway, doesn't hurt.  It does not
   # prevent hanging on ssh prompts, however.
   close(STDIN);
   # Three-argument open with an explicit read mode; a failure is reported
   # but is not fatal, since this is only a precaution.
   open(STDIN, '<', '/dev/null')
     || warn "Couldn't reopen STDIN on /dev/null: $!\n";
 }
 
 sub asmHubDownloadDir {
   # Return the path of the assembly hub download directory for $asmId:
   # $goldenPath followed by the GCA/GCF prefix and the three 3-character
   # groups taken from offsets 4, 7 and 10 of the ID.
   # Confesses unless given exactly one argument that starts with "GC".
   my ($asmId) = @_;
   confess "Must have exactly 1 argument" if (scalar(@_) != 1);
   confess "must supply GC[AF]_... assembly ID" if ($asmId !~ m/^GC/);
   my ($gcX, $d0, $d1, $d2) = map { substr($asmId, $_, 3) } (0, 4, 7, 10);
   return $goldenPath . "/$gcX/$d0/$d1/$d2";
 }
 
 sub asmHubBuildDir {
   # Return the path of the assembly hub build directory for $asmId:
   # /hive/data/genomes/asmHubs/allBuild/ followed by the GCA/GCF prefix,
   # the three 3-character groups taken from offsets 4, 7 and 10 of the
   # ID, and finally the full ID itself.
   # Confesses unless given exactly one argument that starts with "GC".
   my ($asmId) = @_;
   confess "Must have exactly 1 argument" if (scalar(@_) != 1);
   confess "must supply GC[AF]_... assembly ID" if ($asmId !~ m/^GC/);
   my ($gcX, $d0, $d1, $d2) = map { substr($asmId, $_, 3) } (0, 4, 7, 10);
   return "/hive/data/genomes/asmHubs/allBuild/$gcX/$d0/$d1/$d2/$asmId";
 }
 
 sub asmHubSubmitter {
   # Return the submitter named in an assembly report: the text that
   # follows "ubmitter:" and whitespace on the first line of $asmReport
   # containing "submitter" (case-insensitive), with carriage returns and
   # the trailing newline removed.  Returns '' when no line matches or the
   # file cannot be opened (a warning is printed in the latter case).
   my ($asmReport) = @_;
   my $submitter = '';
   # Read the file directly rather than through a shell pipeline, so that
   # paths containing blanks or shell metacharacters work.
   if (open(my $fh, '<', $asmReport)) {
     while (my $line = <$fh>) {
       next if ($line !~ m/submitter/i);
       $submitter = $line;
       last;
     }
     close($fh);
   } else {
     warn "Couldn't open \"$asmReport\": $!\n";
   }
   $submitter =~ tr/\r//d;
   chomp $submitter;
   $submitter =~ s/.*ubmitter:\s+//i;
   return $submitter;
 }
 
 sub asmHubDate {
   # Return the date given in an assembly report: the text that follows
   # "ate:" and whitespace on the first line of $asmReport containing
   # "date:" (case-insensitive), with carriage returns and the trailing
   # newline removed.  Returns '' when no line matches or the file cannot
   # be opened (a warning is printed in the latter case).
   my ($asmReport) = @_;
   my $date = '';
   # Read the file directly rather than through a shell pipeline, so that
   # paths containing blanks or shell metacharacters work.
   if (open(my $fh, '<', $asmReport)) {
     while (my $line = <$fh>) {
       next if ($line !~ m/date:/i);
       $date = $line;
       last;
     }
     close($fh);
   } else {
     warn "Couldn't open \"$asmReport\": $!\n";
   }
   $date =~ tr/\r//d;
   chomp $date;
   $date =~ s/.*ate:\s+//i;
   return $date;
 }
 
 sub asmHubCommonName {
   # common name is in (parens) in the assembly_report 'Organism name:' line
   # Takes the first line of $asmReport containing "organism name:"
   # (case-insensitive), removes carriage returns and the trailing
   # newline, then strips everything up to the last "(" and everything
   # from the first remaining ")".  A line without parentheses is returned
   # whole.  Returns '' when no line matches or the file cannot be opened
   # (a warning is printed in the latter case).
   my ($asmReport) = @_;
   my $names = '';
   # Read the file directly rather than through a shell pipeline, so that
   # paths containing blanks or shell metacharacters work.
   if (open(my $fh, '<', $asmReport)) {
     while (my $line = <$fh>) {
       next if ($line !~ m/organism name:/i);
       $names = $line;
       last;
     }
     close($fh);
   } else {
     warn "Couldn't open \"$asmReport\": $!\n";
   }
   $names =~ tr/\r//d;
   chomp $names;
   $names =~ s/.*\(//;
   $names =~ s/\).*//;
   return $names;
 }
 
 sub getAssemblyInfo {
   # Return (genome, date, source) describing assembly $db, for README.txt.
   # For a name starting with "GC" the three values are read from the
   # assembly report in the hub build directory (confess if that report
   # is missing or empty).  For any other name they are the genome,
   # description and sourceName columns of its dbDb row, fetched by
   # running the central database query on $dbHost over ssh.
   my ($dbHost, $db) = @_;
   confess "Must have exactly 2 arguments" if (scalar(@_) != 2);
   if ($db !~ m/^GC/) {
     my $query = "select genome,description,sourceName from dbDb " .
               "where name = \"$db\";";
     my $row = `echo '$query' | $HgAutomate::runSSH $dbHost $centralDbSql`;
     chomp $row;
     my ($genome, $date, $source) = split("\t", $row);
     return ($genome, $date, $source);
   }
   my $asmReport = asmHubBuildDir($db) . "/download/${db}_assembly_report.txt";
   confess "Can not find $asmReport" if ( ! -s "${asmReport}" );
   my $genome = asmHubCommonName($asmReport);
   my $date = asmHubDate($asmReport);
   my $source = asmHubSubmitter($asmReport);
   return ($genome, $date, $source);
 }
 
 sub getSpecies {
   # Return the scientific name for assembly $db.
   # Looks up scientificName in dbDb (query run on $dbHost over ssh).  When
   # that yields nothing, falls back to the contents of
   # $clusterData/$db/species.name.txt if that file is non-empty, and
   # otherwise to the literal "species name not found".
   my ($dbHost, $db) = @_;
   confess "Must have exactly 2 arguments" if (scalar(@_) != 2);
   my $query = "select scientificName from dbDb " .
               "where name = \"$db\";";
   my $line = `echo '$query' | $HgAutomate::runSSH $dbHost $centralDbSql`;
   chomp $line;
   my ($scientificName) = split("\t", $line);
   # split on an empty result leaves $scientificName undefined, so test
   # for that before taking its length.
   if (! defined $scientificName || length($scientificName) < 1) {
      if ( -s "$HgAutomate::clusterData/$db/species.name.txt" ) {
         $scientificName = `cat $HgAutomate::clusterData/$db/species.name.txt`;
         chomp $scientificName;
      } else {
         $scientificName = "species name not found";
      }
   }
   return ($scientificName);
 } # getSpecies
 
 sub getOrganism {
   # Return the organism column of the dbDb row for $db, obtained by
   # running the central database query on $dbHost over ssh.
   # Confesses unless called with exactly two arguments.
   my ($dbHost, $db) = @_;
   confess "Must have exactly 2 arguments" if (scalar(@_) != 2);
   my $query = "select organism from dbDb " .
               "where name = \"$db\";";
   my $row = `echo '$query' | $HgAutomate::runSSH $dbHost $centralDbSql`;
   chomp $row;
   my @fields = split("\t", $row);
   return ($fields[0]);
 } # getOrganism
 
 # try to find the date and assembly name for a hub given just the accession
 sub hubDateName($) {
   # Given just an accession, return (date, assembly name).
   # Database assemblies (name not starting with "GC") get the dbDb
   # description from hgwdev as date and the accession itself as name.
   # For GenArk hubs the NCBI source directory under
   # /hive/data/outside/ncbi/genomes is consulted: the name is the part
   # of the directory name after the second "_", prefixed with "_", and
   # the date comes from the "# Date:" line of the assembly report.
   # Whatever cannot be determined stays at the defaults "some date" / "".
   my ($accession) = @_;
   if ($accession !~ m/^GC/) {	# not a GenArk hub, database assembly
     my ($comName, $asmDate, $submitter) = getAssemblyInfo("hgwdev", $accession);
     return ($asmDate, $accession);
   }
   my $returnDate = "some date";
   my $returnAsmName = "";
   my ($gcX, $d0, $d1, $d2) = map { substr($accession, $_, 3) } (0, 4, 7, 10);
   # a couple assemblies unfortunately have two different assembly names
   # can't work with those with just an accession
   # special case the CHM13 assembly
   my $betterId = ($accession =~ m/GCA_009914755.4/)
 	? "GCA_009914755.4_T2T-CHM13v2.0" : $accession;
   my $dirCount = `ls -d /hive/data/outside/ncbi/genomes/$gcX/$d0/$d1/$d2/${betterId}* | wc -l`;
   chomp $dirCount;
   # Only an unambiguous (single) source directory is usable.
   return ($returnDate, $returnAsmName) if (1 != $dirCount);
   my $srcDir = `ls -d /hive/data/outside/ncbi/genomes/$gcX/$d0/$d1/$d2/${betterId}*`;
   chomp $srcDir;
   return ($returnDate, $returnAsmName) if ( ! -d "${srcDir}" );
   my $asmId = basename($srcDir);
   my $asmRpt = "$srcDir/${asmId}_assembly_report.txt";
   return ($returnDate, $returnAsmName) if ( ! -s "${asmRpt}" );
   (undef, undef, $returnAsmName) = split('_', $asmId, 3);
   if (defined($returnAsmName)) {
     $returnAsmName =~ s/\r//;
     $returnAsmName =~ s/^/_/;
   } else {
     $returnAsmName = "";
   }
   my $tDate = `egrep -m 1 -i "^#[[:space:]]*Date:" "${asmRpt}" | sed -e 's/.*ate: \\+//;' | tr -d '\r'`;
   chomp $tDate;
   $returnDate = $tDate if (length($tDate));
   return ($returnDate, $returnAsmName);
 }	#	sub hubDateName($)
 
 sub machineHasFile {
   # Count the lines that "ls -1 $file", run on $mach over ssh, prints
   # (errors discarded) and return that count as a number: positive when
   # $mach appears to have $file, 0 when it does not.
   # Confesses unless given exactly two defined arguments.
   my ($mach, $file) = @_;
   confess "Must have exactly 2 arguments" if (scalar(@_) != 2);
   confess "undef input" if (! defined $mach || ! defined $file);
   my $lineCount = `$HgAutomate::runSSH $mach ls -1 $file 2>>/dev/null | wc -l`;
   chomp $lineCount;
   # Adding zero turns the wc output into a plain number.
   return $lineCount + 0;
 }
 
 sub databaseExists {
   # Report whether database $db exists according to a "show databases
   # like" query run through the central database command on $dbHost.
   # Returns 0 for a host name containing "nohost" (any case) or when the
   # query prints nothing; otherwise the length of the printed answer.
   my ($dbHost, $db) = @_;
   return 0 if ($dbHost =~ m/nohost/i);
   confess "Must have exactly 2 arguments" if (scalar(@_) != 2);
   my $query = "show databases like \"$db\";";
   my $answer = `echo '$query' | $HgAutomate::runSSH $dbHost $centralDbSql`;
   chomp $answer;
   # Empty answer means no such database; anything else means it exists.
   return length($answer);
 }
 
+sub dbTableExists {
+  # Report whether $table exists in database $db on $dbHost.
+  # Returns 0 for a host name containing "nohost" (any case), when the
+  # database itself does not exist, or when a "select count(*)" on the
+  # table prints nothing (errors discarded); otherwise the length of the
+  # printed count.
+  my ($dbHost, $db, $table) = @_;
+  return 0 if ($dbHost =~ m/nohost/i);
+  confess "Must have exactly 3 arguments" if (scalar(@_) != 3);
+  # no DB, no table
+  return 0 if (! &HgAutomate::databaseExists($dbHost, $db));
+  my $query = "select count(*) from $db.$table;";
+  my $answer = `echo '$query' | $HgAutomate::runSSH $dbHost $centralDbSql 2>>/dev/null`;
+  chomp $answer;
+  # Empty answer means no such table; anything else means it exists.
+  return length($answer);
+}
+
 sub makeGsub {
   # Write the file "gsub" in $runDir: the template command (trailing
   # newline removed) on a line of its own between a "#LOOP" line and an
   # "#ENDLOOP" line.
   # Confesses unless given exactly two defined arguments.
   my ($runDir, $templateCmd) = @_;
   confess "Must have exactly 2 arguments" if (scalar(@_) != 2);
   confess "undef input" if (! defined $runDir || ! defined $templateCmd);
   chomp $templateCmd;
   my $gsubHandle = mustOpen(">$runDir/gsub");
   print $gsubHandle "#LOOP\n" . $templateCmd . "\n" . "#ENDLOOP\n";
   close($gsubHandle);
 }
 
 sub mustMkdir {
   # Create $dir (and any missing parents) with "mkdir -p", dying if the
   # command fails.  Not affected by -debug: the directory structure has
   # to exist so scripts can be written out even when they are not run.
   # Confesses unless given exactly one defined argument.
   my ($dir) = @_;
   confess "Must have exactly 1 argument" if (scalar(@_) != 1);
   confess "undef input" if (! defined $dir);
   my $status = system("mkdir -p $dir");
   die "Couldn't mkdir $dir\n" if ($status != 0);
 }
 
 sub mustOpen {
   # Open $fileSpec and return the new filehandle, or die with a message
   # naming the spec and the system error.  $fileSpec goes to the
   # two-argument form of open, so it may carry a mode prefix such as ">".
   # Confesses unless given exactly one defined argument.
   my ($fileSpec) = @_;
   confess "Must have exactly 1 argument" if (scalar(@_) != 1);
   confess "undef input" if (! defined $fileSpec);
   my $handle;
   if (! open($handle, $fileSpec)) {
     die "Couldn't open \"$fileSpec\": $!\n";
   }
   return $handle;
 }
 
 sub nfsNoodge {
   # Give NFS a chance to notice a newly created file.  Touching the
   # file's directory causes NFS to refresh its directory information;
   # localhost sometimes cannot see the new file immediately, so an
   # artificial delay keeps the next step from dying for lack of it.
   # Up to five rounds of: touch the directory, sleep 4 seconds, stop as
   # soon as $file exists with non-zero size.  Does nothing under -debug.
   # Confesses unless given exactly one defined argument.
   my ($file) = @_;
   confess "Must have exactly 1 argument" if (scalar(@_) != 1);
   confess "undef input" if (! defined $file);
   return if ($main::opt_debug);
   my $dir = dirname($file);
   foreach my $attempt (1 .. 5) {
     `touch $dir`;
     sleep(4);
     last if ( -s $file );
   }
 }
 
 sub run {
   # Execute $cmd through system(), dying if it fails.  Under -debug the
   # command is only printed with a "#DEBUG# " prefix and not executed;
   # otherwise it is first echoed at verbose level 1.
   # Confesses unless given exactly one defined argument.
   my ($cmd) = @_;
   confess "Must have exactly 1 argument" if (scalar(@_) != 1);
   confess "undef input" if (! defined $cmd);
   if (! $main::opt_debug) {
     verbose(1, "# $cmd\n");
     my $status = system($cmd);
     die "Command failed:\n$cmd\n" if ($status != 0);
   } else {
     print "#DEBUG# $cmd\n";
   }
 }
 
 sub verbose {
   # Print $message to STDERR when the current verbosity
   # ($main::opt_verbose) is at least $level.
   # Confesses unless given exactly two defined arguments.
   my ($level, $message) = @_;
   confess "Must have exactly 2 arguments" if (scalar(@_) != 2);
   confess "undef input" if (! defined $level || ! defined $message);
   if ($main::opt_verbose >= $level) {
     print STDERR $message;
   }
 }
 
 
 # perl packages need to end by returning a positive value:
 1;