2aa890be7b759af7095a891fa999cb27a63bf5a2
hiram
  Fri Feb 28 15:32:32 2020 -0800
better detection of whether a step is already done, and fix up spaces in gene names from NCBI; refs #23891

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 58dd6e3..bd3e7fb 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -1442,38 +1442,52 @@
     -chromSizes=$buildDir/\$asmId.chrom.sizes \$asmId
 else
   printf "# cpgIslands masked previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # sub doCpgIslands
 
 #########################################################################
 # * step: ncbiGene [workhorse]
 sub doNcbiGene {
   my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz";
   if ( ! -s "${gffFile}" ) {
-    printf STDERR "# step ncbiGene: no gff file found at:\n#  %s\n", $gffFile;
+    &HgAutomate::verbose(1, "# step ncbiGene: no gff file found at:\n#  %s\n", $gffFile);
     return;
   }
   if ( ! -s "$buildDir/sequence/$asmId.ncbiToUcsc.lift" ) {
-    printf STDERR "# ERROR: ncbiGene: can not find $buildDir/sequence/$asmId.ncbiToUcsc.lift\n";
+    &HgAutomate::verbose(1, "# ERROR: ncbiGene: can not find $buildDir/sequence/$asmId.ncbiToUcsc.lift\n");
     exit 255;
   }
   my $runDir = "$buildDir/trackData/ncbiGene";
+  if (-d "${runDir}" ) {
+     if (! -s "$runDir/$asmId.ncbiGene.bb") {
+       &HgAutomate::verbose(1,
+       "WARNING ncbiGene step may already be running, but not completed ?\n");
+       return;
+     } elsif (! needsUpdate("$gffFile", "$runDir/$asmId.ncbiGene.bb")) {
+       &HgAutomate::verbose(1, "# ncbiGene step previously completed\n");
+       return;
+     }
+  }
+  if (! -s "$buildDir/$asmId.faSize.txt") {
+    &HgAutomate::verbose(1, "# step ncbiGene: can not find faSize.txt at:\n#  %s\n", "$buildDir/$asmId.faSize.txt");
+    exit 255;
+  }
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "translate NCBI GFF3 gene definitions into a track";
   my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 export gffFile=$gffFile
 
 function cleanUp() {
   rm -f \$asmId.ncbiGene.genePred.gz \$asmId.ncbiGene.genePred
   rm -f \$asmId.geneAttrs.ncbi.txt
 }
@@ -1482,47 +1496,54 @@
   (gff3ToGenePred -warnAndContinue -useName \\
     -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\
       2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter stdin stdout \\
         | gzip -c > \$asmId.ncbiGene.genePred.gz
   genePredCheck \$asmId.ncbiGene.genePred.gz
   export howMany=`genePredCheck \$asmId.ncbiGene.genePred.gz 2>&1 | grep "^checked" | awk '{print \$2}'`
   if [ "\${howMany}" -eq 0 ]; then
      printf "# ncbiGene: no gene definitions found in \$gffFile\n";
      cleanUp
      exit 0
   fi
   liftUp -extGenePred -type=.gp stdout \\
       ../../sequence/\$asmId.ncbiToUcsc.lift warn \\
        \$asmId.ncbiGene.genePred.gz | gzip -c \\
           > \$asmId.ncbiGene.ucsc.genePred.gz
+  genePredToBed -tab -fillSpace \$asmId.ncbiGene.ucsc.genePred.gz stdout \\
+    | bedToExons stdin stdout | bedSingleCover.pl stdin > \$asmId.exons.bed
+  export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' \$asmId.exons.bed`
+  export asmSizeNoGaps=`grep sequences ../../\$asmId.faSize.txt | awk '{print \$1}'`
+  export perCent=`echo \$baseCount \$asmSizeNoGaps | awk '{printf "%.3f", 100.0*\$1/\$2}'`
+  rm -f \$asmId.exons.bed
   ~/kent/src/hg/utils/automation/gpToIx.pl \$asmId.ncbiGene.ucsc.genePred.gz \\
     | sort -u > \$asmId.ncbiGene.ix.txt
   ixIxx \$asmId.ncbiGene.ix.txt \$asmId.ncbiGene.ix \$asmId.ncbiGene.ixx
   rm -f \$asmId.ncbiGene.ix.txt
   genePredToBigGenePred \$asmId.ncbiGene.ucsc.genePred.gz stdout \\
       | sort -k1,1 -k2,2n > \$asmId.ncbiGene.bed
   (bedToBigBed -type=bed12+8 -tab -as=\$HOME/kent/src/hg/lib/bigGenePred.as \\
       -extraIndex=name \$asmId.ncbiGene.bed \\
         ../../\$asmId.chrom.sizes \$asmId.ncbiGene.bb || true)
   if [ ! -s "\$asmId.ncbiGene.bb" ]; then
     printf "# ncbiGene: failing bedToBigBed\\n" 1>&2
     exit 255
   fi
   touch -r\$gffFile \$asmId.ncbiGene.bb
   bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\
     | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt
   LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo
+  printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$asmId.ncbiGene.txt
 else
   printf "# ncbiGene step previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doNcbiGene
 
 #########################################################################
 # * step: ncbiRefSeq [workhorse]
 sub doNcbiRefSeq {
   my $runDir = "$buildDir/trackData/ncbiRefSeq";
   my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz";
   if ( ! -s "${gffFile}" ) {
     printf STDERR "# step ncbiRefSeq no gff file found at:\n#  %s\n", $gffFile;
@@ -1549,30 +1570,45 @@
       -target2bit="\$target2bit" \\
       -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\
       \$asmId \$asmId
 else
   printf "# ncbiRefSeq step previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # ncbiRefSeq
 
 #########################################################################
 # * step: augustus [workhorse]
 sub doAugustus {
   my $runDir = "$buildDir/trackData/augustus";
+  if (! -s "$buildDir/$asmId.2bit") {
+    &HgAutomate::verbose(1,
+	"ERROR: augustus step can not find $buildDir/$asmId.2bit\n");
+    exit 255;
+  }
+  if (-d "${runDir}" ) {
+     if (! -s "$runDir/$asmId.augustus.bb") {
+       &HgAutomate::verbose(1,
+       "WARNING augustus step may already be running, but not completed ?\n");
+       return;
+     } elsif (! needsUpdate("$buildDir/$asmId.2bit", "$runDir/$asmId.augustus.bb")) {
+       &HgAutomate::verbose(1, "# augustus step previously completed\n");
+       return;
+     }
+  }
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "run Augustus gene prediction procedures";
   my $bossScript = newBash HgRemoteScript("$runDir/doAugustus.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt \$asmId.augustus.bb ]; then
   time (~/kent/src/hg/utils/automation/doAugustus.pl -stop=makeGp -buildDir=`pwd` -dbHost=$dbHost \\
     -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\
     -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > makeDb.log 2>&1
   time (~/kent/src/hg/utils/automation/doAugustus.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\