src/hg/utils/automation/doAssemblyHub.pl 653eaa635409806f4885b0486125efd4390c0931

653eaa635409806f4885b0486125efd4390c0931
hiram
  Thu Aug 27 14:00:07 2020 -0700
fix scripts to behave properly when not using UCSC sequence names refs #23734

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index bc49a4b..222bfa4 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -843,44 +843,52 @@
 # verify everything is there
 twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes
 export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"`
 export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"`
 if [ "\$newTotal" != "\$oldTotal" ]; then
   printf "# ERROR: sequence construction error: not same totals source vs. new:\\n" 1>&2
   printf "# \$newTotal != \$oldTotal\\n" 1>&2
   exit 255
 fi
 rm source.\$asmId.chrom.sizes
 export checkAgp=`checkAgpAndFa ../\$asmId.agp.gz ../\$asmId.2bit 2>&1 | tail -1`
 if [ "\$checkAgp" != "All AGP and FASTA entries agree - both files are valid" ]; then
   printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\\n" 1>&2
   exit 255
 fi
+_EOF_
+  );
+  if ($ucscNames) {
+    $bossScript->add(<<_EOF_
 join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift
 join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift
 export c0=`cat \$asmId.ncbiToUcsc.lift | wc -l`
 export c1=`cat \$asmId.ucscToNcbi.lift | wc -l`
 export c2=`cat ../\$asmId.chrom.sizes | wc -l`
 # verify all names are accounted for
 if [ "\$c0" -ne "\$c2" ]; then
   printf "# ERROR: not all names accounted for in \$asmId.ncbiToUcsc.lift" 1>&2
   exit 255
 fi
 if [ "\$c1" -ne "\$c2" ]; then
   printf "# ERROR: not all names accounted for in \$asmId.ucscToNcbi.lift" 1>&2
   exit 255
 fi
+_EOF_
+    );
+  }
+  $bossScript->add(<<_EOF_
 twoBitToFa ../\$asmId.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz
 touch -r ../\$asmId.2bit \$asmId.faCount.txt.gz
 zgrep -P "^total\t" \$asmId.faCount.txt.gz > \$asmId.faCount.signature.txt
 touch -r ../\$asmId.2bit \$asmId.faCount.signature.txt
 _EOF_
   );
   $bossScript->execute();
 } # doSequence
 
 #########################################################################
 # * step: assemblyGap [workhorse]
 sub doAssemblyGap {
   my $runDir = "$buildDir/trackData/assemblyGap";
   &HgAutomate::mustMkdir($runDir);
 
@@ -1474,34 +1482,36 @@
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # sub doCpgIslands
 
 #########################################################################
 # * step: ncbiGene [workhorse]
 sub doNcbiGene {
   my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz";
   if ( ! -s "${gffFile}" ) {
     &HgAutomate::verbose(1, "# step ncbiGene: no gff file found at:\n#  $gffFile\n");
     return;
   }
+  if ($ucscNames) {
     if ( ! -s "$buildDir/sequence/$asmId.ncbiToUcsc.lift" ) {
       &HgAutomate::verbose(1, "# ERROR: ncbiGene: can not find $buildDir/sequence/$asmId.ncbiToUcsc.lift\n");
       exit 255;
     }
+  }
   my $runDir = "$buildDir/trackData/ncbiGene";
   if (-d "${runDir}" ) {
      if (! -s "$runDir/$asmId.ncbiGene.bb") {
        &HgAutomate::verbose(1,
        "WARNING ncbiGene step may already be running, but not completed ?\n");
        return;
      } elsif (! needsUpdate("$gffFile", "$runDir/$asmId.ncbiGene.bb")) {
        &HgAutomate::verbose(1, "# ncbiGene step previously completed\n");
        return;
      }
   }
   if (! -s "$buildDir/$asmId.faSize.txt") {
     &HgAutomate::verbose(1, "# step ncbiGene: can not find faSize.txt at:\n#  $buildDir/$asmId.faSize.txt\n");
     exit 255;
   }
@@ -1521,45 +1531,55 @@
   rm -f \$asmId.geneAttrs.ncbi.txt
 }
 
 if [ \$gffFile -nt \$asmId.ncbiGene.bb ]; then
   (gff3ToGenePred -warnAndContinue -useName \\
     -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\
       2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter stdin stdout \\
         | gzip -c > \$asmId.ncbiGene.genePred.gz
   genePredCheck \$asmId.ncbiGene.genePred.gz
   export howMany=`genePredCheck \$asmId.ncbiGene.genePred.gz 2>&1 | grep "^checked" | awk '{print \$2}'`
   if [ "\${howMany}" -eq 0 ]; then
      printf "# ncbiGene: no gene definitions found in \$gffFile\n";
      cleanUp
      exit 0
   fi
+  export ncbiGenePred="\$asmId.ncbiGene.genePred.gz"
+_EOF_
+  );
+  if ($ucscNames) {
+    $bossScript->add(<<_EOF_
   liftUp -extGenePred -type=.gp stdout \\
     ../../sequence/\$asmId.ncbiToUcsc.lift warn \\
       \$asmId.ncbiGene.genePred.gz | gzip -c \\
         > \$asmId.ncbiGene.ucsc.genePred.gz
-  genePredToBed -tab -fillSpace \$asmId.ncbiGene.ucsc.genePred.gz stdout \\
+  ncbiGenePred="\$asmId.ncbiGene.ucsc.genePred.gz"
+_EOF_
+      );
+  }
+  $bossScript->add(<<_EOF_
+  genePredToBed -tab -fillSpace \$ncbiGenePred stdout \\
     | bedToExons stdin stdout | bedSingleCover.pl stdin > \$asmId.exons.bed
   export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' \$asmId.exons.bed`
   export asmSizeNoGaps=`grep sequences ../../\$asmId.faSize.txt | awk '{print \$5}'`
   export perCent=`echo \$baseCount \$asmSizeNoGaps | awk '{printf "%.3f", 100.0*\$1/\$2}'`
   rm -f \$asmId.exons.bed
-  ~/kent/src/hg/utils/automation/gpToIx.pl \$asmId.ncbiGene.ucsc.genePred.gz \\
+  ~/kent/src/hg/utils/automation/gpToIx.pl \$ncbiGenePred \\
     | sort -u > \$asmId.ncbiGene.ix.txt
   ixIxx \$asmId.ncbiGene.ix.txt \$asmId.ncbiGene.ix \$asmId.ncbiGene.ixx
   rm -f \$asmId.ncbiGene.ix.txt
-  genePredToBigGenePred \$asmId.ncbiGene.ucsc.genePred.gz stdout \\
+  genePredToBigGenePred \$ncbiGenePred stdout \\
       | sort -k1,1 -k2,2n > \$asmId.ncbiGene.bed
   (bedToBigBed -type=bed12+8 -tab -as=\$HOME/kent/src/hg/lib/bigGenePred.as \\
       -extraIndex=name \$asmId.ncbiGene.bed \\
         ../../\$asmId.chrom.sizes \$asmId.ncbiGene.bb || true)
   if [ ! -s "\$asmId.ncbiGene.bb" ]; then
     printf "# ncbiGene: failing bedToBigBed\\n" 1>&2
     exit 255
   fi
   touch -r\$gffFile \$asmId.ncbiGene.bb
   bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\
     | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt
   LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo
   printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$asmId.ncbiGene.txt
 else
   printf "# ncbiGene step previously completed\\n" 1>&2
@@ -1589,41 +1609,44 @@
     }
   }
 
   if ($filesFound < $filesExpected) {
     printf STDERR "# step ncbiRefSeq does not have all files required\n";
     return;
   }
 
   my $runDir = "$buildDir/trackData/ncbiRefSeq";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "run NCBI RefSeq gene procedures";
   my $bossScript = newBash HgRemoteScript("$runDir/doNcbiRefSeq.bash",
                     $workhorse, $runDir, $whatItDoes);
 
+  my $liftSpec = "";
+  if ($ucscNames) {
+    $liftSpec="-liftFile=\"\$buildDir/sequence/\$asmId.ncbiToUcsc.lift\"";
+  }
   $bossScript->add(<<_EOF_
 export asmId="$asmId"
 export buildDir="$buildDir"
-export liftFile="\$buildDir/sequence/\$asmId.ncbiToUcsc.lift"
+export liftSpec="$liftSpec"
 export target2bit="\$buildDir/\$asmId.2bit"
 
 if [ $buildDir/\$asmId.2bit -nt \$asmId.ncbiRefSeq.bb ]; then
 
 ~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -toGpWarnOnly -buildDir=`pwd` \\
-      -bigClusterHub=$bigClusterHub -dbHost=$dbHost \\
-      -liftFile="\$liftFile" \\
+      -assemblyHub -bigClusterHub=$bigClusterHub -dbHost=$dbHost $liftSpec \\
       -target2bit="\$target2bit" \\
       -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\
       \$asmId \$asmId
 else
   printf "# ncbiRefSeq step previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # ncbiRefSeq
 
 #########################################################################
 # * step: augustus [workhorse]
 sub doAugustus {
   my $runDir = "$buildDir/trackData/augustus";