66a4bb8eccef8184fe80dbeb638c99f2dcbda169
hiram
  Wed Feb 26 14:53:01 2020 -0800
better detection if steps need to be done or have been done or are currently running refs #23891

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index cdd137e..58dd6e3 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -67,31 +67,31 @@
 my $ftpDir = "";	# will be determined from given asmId
 my $rmskSpecies = "";
 my $augustusSpecies = "human";
 my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq";
 my $ucscNames = 0;  # default 'FALSE' (== 0)
 my $asmHubName = "n/a";  # directory name in: /gbdb/hubs/asmHubName
 my $workhorse = "hgwdev";  # default workhorse when none chosen
 my $fileServer = "hgwdev";  # default when none chosen
 my $bigClusterHub = "ku";  # default when none chosen
 my $smallClusterHub = "ku";  # default when none chosen
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 # key is original accession name from the remove.dups.list, value is 1
-my %dupAccessionList = {};
+my %dupAccessionList;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base [options] asmId
 required arguments:
     asmId          - assembly identifier at NCBI FTP site, examples:
                    - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc..
 
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
@@ -924,41 +924,43 @@
 sub doGatewayPage {
   if ($asmHubName eq "n/a") {
     printf STDERR "ERROR: step gatewayPage needs argument -asmHubName <name>\n";
     exit 255;
   }
   my $runDir = "$buildDir/html";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct html/$asmId.description.html";
   my $bossScript = newBash HgRemoteScript("$runDir/doGatewayPage.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   my $photoJpg = "noPhoto";
   my $photoCredit = "noPhoto";
   my $photoLink = "";
-  if ( -s "$runDir/../photo/$species.jpg" ) {
+  my $speciesNoBlank = $species;  # copy: $species itself is left unchanged
+  $speciesNoBlank =~ s/ /_/g;  # every blank becomes '_' for the file name and the shell export below
+  if ( -s "$runDir/../photo/$speciesNoBlank.jpg" ) {  # photo file exists and is not empty
      $photoJpg = "../photo/\${species}.jpg";
      $photoCredit = "../photo/photoCredits.txt";
      $photoLink = "rm -f \${species}.jpg; ln -s ../photo/\${species}.jpg ."
   } else {
      printf STDERR "# gatewayPage: warning: no photograph available\n";
   }
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
-export species=$species
+export species=$speciesNoBlank
 
 \$HOME/kent/src/hg/utils/automation/asmHubGatewayPage.pl \\
      $asmHubName ../download/\${asmId}_assembly_report.txt \\
        ../\${asmId}.chrom.sizes \\
          $photoJpg $photoCredit \\
            > \$asmId.description.html 2> \$asmId.names.tab
 \$HOME/kent/src/hg/utils/automation/genbank/buildStats.pl \\
        ../\$asmId.chrom.sizes 2> \$asmId.build.stats.txt
 touch -r ../download/\${asmId}_assembly_report.txt \$asmId.description.html
 $photoLink
 _EOF_
   );
   $bossScript->execute();
 } # gatewayPage
 
@@ -1016,30 +1018,37 @@
   rm -f \$asmId.wigVarStep.gz
   touch -r ../../\$asmId.2bit \$asmId.gc5Base.bw
 else
   printf "# gc5Base step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # gc5Base
 
 #########################################################################
 # * step: repeatMasker [workhorse]
 sub doRepeatMasker {
   my $runDir = "$buildDir/trackData/repeatMasker";
+  if ( -d "$buildDir/trackData/repeatMasker/run.cluster" ) {
+     if ( ! -s "$buildDir/trackData/repeatMasker/faSize.rmsk.txt" ) {
+       &HgAutomate::verbose(1,
+	"\nERROR: step repeatmasker may be running\n");
+       exit 255;
+     }
+  }
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct repeatMasker track data";
   my $bossScript = newBash HgRemoteScript("$runDir/doRepeatMasker.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then
 export species=`echo $rmskSpecies | sed -e 's/_/ /g;'`
 
 doRepeatMasker.pl -stop=mask -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit \\
   -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId
 
@@ -1174,41 +1183,51 @@
   rm -f toBbi.bed
   gzip *.bed
 else
   printf "# allgaps step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # allGaps
 
 #########################################################################
 # * step: idKeys [workhorse]
 sub doIdKeys {
   my $runDir = "$buildDir/trackData/idKeys";
+  if (! -s "$buildDir/$asmId.2bit") {  # the 2bit is the input handed to doIdKeys.pl below
+    &HgAutomate::verbose(1, "ERROR: idKeys: can not find $buildDir/$asmId.2bit\n");
+    exit 255;
+  }
+  if (! needsUpdate("$buildDir/$asmId.2bit", "$runDir/$asmId.keySignature.txt")) {  # needsUpdate() is defined elsewhere; presumably a file-time comparison - verify
+     &HgAutomate::verbose(1, "# idKeys step previously completed\n");
+     return;  # skip before any directory or script is created
+  }
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct ID key data for each contig/chr";
   my $bossScript = newBash HgRemoteScript("$runDir/doIdKeys.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
+export twoBit=$buildDir/\$asmId.2bit
 
 if [ ../../\$asmId.2bit -nt \$asmId.keySignature.txt ]; then
-  doIdKeys.pl \$asmId -buildDir=`pwd` -twoBit=../../\$asmId.2bit
+  doIdKeys.pl \$asmId -buildDir=`pwd` -twoBit=\$twoBit
+  touch -r \$twoBit \$asmId.keySignature.txt
 else
   printf "# idKeys step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doIdKeys
 
 #########################################################################
 # * step: addMask [workhorse]
 sub doAddMask {
   my $runDir = "$buildDir/trackData/addMask";
 
   my $goNoGo = 0;
@@ -1330,42 +1349,60 @@
 
 if [ ../../\$asmId.unmasked.2bit -nt \$asmId.gapOverlap.bed.gz ]; then
   doGapOverlap.pl -buildDir=`pwd` -bigClusterHub=$bigClusterHub -smallClusterHub=$smallClusterHub -workhorse=$workhorse -twoBit=../../\$asmId.2bit \$asmId
 else
   printf "# gapOverlap step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doGapOverlap
 
 #########################################################################
 # * step: tandemDups [workhorse]
 sub doTandemDups {
-  my $runDir = "$buildDir/trackData/gapOverlap";
+  my $runDir = "$buildDir/trackData/tandemDups";
+  if (! -s "$buildDir/$asmId.unmasked.2bit") {  # the unmasked 2bit is the -twoBit input given to doTandemDup.pl below
+    &HgAutomate::verbose(1,
+	"ERROR: tandemDups: can not find $buildDir/$asmId.unmasked.2bit\n");
+    exit 255;
+  }
+  if (-d "${runDir}" ) {  # run directory already exists
+     if (! -s "$runDir/$asmId.tandemDups.bb") {  # ... but the bigBed result is missing or empty
+       &HgAutomate::verbose(1,
+       "WARNING tandemDups step may already be running, but not completed ?\n");
+       return;  # NOTE(review): returns normally, unlike the exit 255 above - confirm this is intended
+     } elsif (! needsUpdate("$buildDir/$asmId.unmasked.2bit", "$runDir/$asmId.tandemDups.bb")) {  # needsUpdate() is defined elsewhere; presumably a file-time comparison - verify
+       &HgAutomate::verbose(1, "# tandemDups step previously completed\n");
+       return;  # nothing to do
+     }
+  }
+
   &HgAutomate::mustMkdir($runDir);
 
-  my $whatItDoes = "construct gap overlap track (duplicate sequence on each side of a gap)";
+  my $whatItDoes = "construct tandem dups track (nearby pairs of exact duplicate sequence)";
   my $bossScript = newBash HgRemoteScript("$runDir/doTandemDups.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
+export twoBit=$buildDir/\$asmId.unmasked.2bit
 
-if [ ../../\$asmId.unmasked.2bit -nt \$asmId.gapOverlap.bed.gz ]; then
-  doTandemDup.pl -buildDir=`pwd` -bigClusterHub=$bigClusterHub -smallClusterHub=$smallClusterHub -workhorse=$workhorse -twoBit=../../\$asmId.2bit \$asmId
+if [ \$twoBit -nt \$asmId.tandemDups.bb ]; then
+  doTandemDup.pl -buildDir=`pwd` -bigClusterHub=$bigClusterHub -smallClusterHub=$smallClusterHub -workhorse=$workhorse -twoBit=\$twoBit \$asmId
+  touch -r \$twoBit \$asmId.tandemDups.bb
 else
   printf "# tandemDups step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doTandemDups
 
 #########################################################################
 # * step: cpgIslands [workhorse]
 sub doCpgIslands {
   my $runDir = "$buildDir/trackData/cpgIslands";
 
   &HgAutomate::mustMkdir($runDir);
@@ -1409,31 +1446,31 @@
 fi
 _EOF_
   );
   $bossScript->execute();
 } # sub doCpgIslands
 
 #########################################################################
 # * step: ncbiGene [workhorse]
 sub doNcbiGene {
   my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz";
   if ( ! -s "${gffFile}" ) {
     printf STDERR "# step ncbiGene: no gff file found at:\n#  %s\n", $gffFile;
     return;
   }
   if ( ! -s "$buildDir/sequence/$asmId.ncbiToUcsc.lift" ) {
-    printf STDERR "# ERROR: ncbiGene: can not find ../../sequence/$asmId.ncbiToUcsc.lift\n";
+    printf STDERR "# ERROR: ncbiGene: can not find $buildDir/sequence/$asmId.ncbiToUcsc.lift\n";
     exit 255;
   }
   my $runDir = "$buildDir/trackData/ncbiGene";
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "translate NCBI GFF3 gene definitions into a track";
   my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 export gffFile=$gffFile
 
 function cleanUp() {
@@ -1463,31 +1500,31 @@
   rm -f \$asmId.ncbiGene.ix.txt
   genePredToBigGenePred \$asmId.ncbiGene.ucsc.genePred.gz stdout \\
       | sort -k1,1 -k2,2n > \$asmId.ncbiGene.bed
   (bedToBigBed -type=bed12+8 -tab -as=\$HOME/kent/src/hg/lib/bigGenePred.as \\
       -extraIndex=name \$asmId.ncbiGene.bed \\
         ../../\$asmId.chrom.sizes \$asmId.ncbiGene.bb || true)
   if [ ! -s "\$asmId.ncbiGene.bb" ]; then
     printf "# ncbiGene: failing bedToBigBed\\n" 1>&2
     exit 255
   fi
   touch -r\$gffFile \$asmId.ncbiGene.bb
   bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\
     | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt
   LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo
 else
-  printf "# ncbiGene previously completed\\n" 1>&2
+  printf "# ncbiGene step previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doNcbiGene
 
 #########################################################################
 # * step: ncbiRefSeq [workhorse]
 sub doNcbiRefSeq {
   my $runDir = "$buildDir/trackData/ncbiRefSeq";
   my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz";
   if ( ! -s "${gffFile}" ) {
     printf STDERR "# step ncbiRefSeq no gff file found at:\n#  %s\n", $gffFile;
     return;
   }
@@ -1501,91 +1538,91 @@
   $bossScript->add(<<_EOF_
 export asmId="$asmId"
 export buildDir="$buildDir"
 export liftFile="\$buildDir/sequence/\$asmId.ncbiToUcsc.lift"
 export target2bit="\$buildDir/\$asmId.2bit"
 
 if [ $buildDir/\$asmId.2bit -nt \$asmId.ncbiRefSeq.bb ]; then
 
 ~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -toGpWarnOnly -buildDir=`pwd` \\
       -bigClusterHub=$bigClusterHub -dbHost=$dbHost \\
       -liftFile="\$liftFile" \\
       -target2bit="\$target2bit" \\
       -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\
       \$asmId \$asmId
 else
-  printf "# ncbiRefSeq previously completed\\n" 1>&2
+  printf "# ncbiRefSeq step previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # ncbiRefSeq
 
 #########################################################################
 # * step: augustus [workhorse]
 sub doAugustus {
   my $runDir = "$buildDir/trackData/augustus";
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "run Augustus gene prediction procedures";
   my $bossScript = newBash HgRemoteScript("$runDir/doAugustus.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt \$asmId.augustus.bb ]; then
   time (~/kent/src/hg/utils/automation/doAugustus.pl -stop=makeGp -buildDir=`pwd` -dbHost=$dbHost \\
     -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\
     -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > makeDb.log 2>&1
   time (~/kent/src/hg/utils/automation/doAugustus.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\
     -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\
     -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > cleanup.log 2>&1
 else
-  printf "# augustus genes previously completed\\n" 1>&2
+  printf "# augustus genes step previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doAugustus
 
 #########################################################################
 # * step: xenoRefGene [bigClusterHub]
 sub doXenoRefGene {
   my $runDir = "$buildDir/trackData/xenoRefGene";
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "run xeno RefSeq gene mapping procedures";
   my $bossScript = newBash HgRemoteScript("$runDir/doXenoRefGene.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt \$asmId.xenoRefGene.bb ]; then
   time (~/kent/src/hg/utils/automation/doXenoRefGene.pl -buildDir=`pwd` -dbHost=$dbHost \\
     -bigClusterHub=$bigClusterHub -mrnas=$xenoRefSeq -workhorse=$workhorse \\
     -maskedSeq=$buildDir/trackData/addMask/\$asmId.masked.2bit \$asmId) > do.log 2>&1
   if [ -s "\$asmId.xenoRefGene.bb" ]; then
   bigBedInfo \$asmId.xenoRefGene.bb | egrep "^itemCount:|^basesCovered:" \\
     | sed -e 's/,//g' > \$asmId.xenoRefGene.stats.txt
   LC_NUMERIC=en_US /usr/bin/printf "# xenoRefGene %s %'d %s %'d\\n" `cat \$asmId.xenoRefGene.stats.txt` | xargs echo
   fi
 else
-  printf "# xenoRefGene previously completed\\n" 1>&2
+  printf "# xenoRefGene step previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doXenoRefGene
 
 #########################################################################
 # * step: trackDb [workhorse]
 sub doTrackDb {
   my $runDir = "$buildDir";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct asmId.trackDb.txt file";
   my $bossScript = newBash HgRemoteScript("$runDir/doTrackDb.bash",
                     $workhorse, $runDir, $whatItDoes);
@@ -1634,67 +1671,69 @@
 # GC[AF]_123456789.3_assembly_Name
 #   0         1         2      3 ....
 my @partNames = split('_', $asmId);
 $ftpDir = sprintf("%s/%s/%s/%s/%s", $partNames[0],
    substr($partNames[1],0,3), substr($partNames[1],3,3),
    substr($partNames[1],6,3), $asmId);
 
 # Force debug and verbose until this is looking pretty solid:
 # $opt_debug = 1;
 # $opt_verbose = 3 if ($opt_verbose < 3);
 
 # Establish what directory we will work in.
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/asmHubs/refseqBuild/$ftpDir";
 
-$assemblySource = $opt_sourceDir ? "$sourceDir" : "$sourceDir/$ftpDir";
+$sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir;  # resolved first: the next line builds on it
+$assemblySource = $opt_sourceDir ? "$opt_sourceDir" : "$sourceDir/$ftpDir";
 my $asmReport = "$assemblySource/${asmId}_assembly_report.txt";
 
 $species = $opt_species ? $opt_species : $species;
 
 if (length($species) < 1) {
   if (-s "$asmReport") {
      $species = `grep -i "organism name:" $asmReport`;
      chomp $species;
      $species =~ s/.*organism\s+name:\s+//i;
      $species =~ s/\s+\(.*//;
   } else {
      die "no -species specified and can not find $asmReport";
   }
   if (length($species) < 1) {
      die "no -species specified and can not find Organism name: in $asmReport";
   }
 }
 
-$sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir;
 $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species;
 $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies;
 $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq;
 $ucscNames = $opt_ucscNames ? 1 : $ucscNames;   # '1' == 'TRUE'
 $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse;
 $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub;
 $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub;
 $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer;
 $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName;
 
 
 die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource);
 printf STDERR "# buildDir: %s\n", $buildDir;
 printf STDERR "# sourceDir %s\n", $sourceDir;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq;
 printf STDERR "# assemblySource: %s\n", $assemblySource;
+printf STDERR "# asmHubName %s\n", $asmHubName;
+printf STDERR "# rmskSpecies %s\n", $rmskSpecies;
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;