9fb7e2a933dd0b378f251d348153730c9972ddcd
hiram
  Wed Sep 1 15:24:05 2021 -0700
eliminate small errors in unlocalized scaffold names, and add -chromSizes to genePredFilter for ncbiGene to eliminate problem gene definitions; no redmine

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index b6f4a00..8c3e1ef 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -777,50 +777,50 @@
 
     if (needsUpdate($unplacedScafAgp, $agpOutput)) {
       unplacedAgp($unplacedScafAgp, $agpOutput, $agpNames, $chrPrefix);
       `touch -r $unplacedScafAgp $agpOutput`;
     }
     my $fastaOut = "$runDir/$asmId.unplaced.fa.gz";
     if (needsUpdate($twoBitFile, $fastaOut)) {
       unplacedFasta($unplacedScafAgp, $twoBitFile, $chrPrefix, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
   ###########  non-nuclear chromosome sequence  ################
   my $nonNucAsm = "$buildDir/download/${asmId}_assembly_structure/non-nuclear";
   my $nonNucChr2acc = "$nonNucAsm/assembled_chromosomes/chr2acc";
-  my $agpSource = "$nonNucAsm/assembled_chromosomes/AGP";
   if ( -s $nonNucChr2acc ) {
+    my $agpSource = "$nonNucAsm/assembled_chromosomes/AGP";
     my $agpOutput = "$runDir/$asmId.nonNucChr.agp.gz";
     my $agpNames = "$runDir/$asmId.nonNucChr.names";
     my $fastaOut = "$runDir/$asmId.nonNucChr.fa.gz";
     $partsDone += 1;
     if (needsUpdate($nonNucChr2acc, $agpOutput)) {
       compositeAgp($nonNucChr2acc, $agpSource, $agpOutput, $agpNames);
       `touch -r $nonNucChr2acc $agpOutput`;
     }
     if (needsUpdate($twoBitFile, $fastaOut)) {
       compositeFasta($nonNucChr2acc, $twoBitFile, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
   ###########  non-nuclear scaffold unlocalized sequence  ################
   my $nonNucChr2scaf = "$nonNucAsm/unlocalized_scaffolds/unlocalized.chr2scaf";
-  my $agpSource = "$nonNucAsm/unlocalized_scaffolds/AGP";
   if ( -s $nonNucChr2scaf ) {
+    my $agpSource = "$nonNucAsm/unlocalized_scaffolds/AGP";
     my $agpOutput = "$runDir/$asmId.nonNucUnlocalized.agp.gz";
     my $agpNames = "$runDir/$asmId.nonNucUnlocalized.names";
     my $fastaOut = "$runDir/$asmId.nonNucUnlocalized.fa.gz";
     $partsDone += 1;
     if (needsUpdate($nonNucChr2scaf, $agpOutput)) {
       unlocalizedAgp($nonNucChr2scaf, $agpSource, $agpOutput, $agpNames);
       `touch -r $nonNucChr2scaf $agpOutput`;
     }
     if (needsUpdate($twoBitFile, $fastaOut)) {
       unlocalizedFasta($nonNucChr2scaf, $twoBitFile, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
   $bossScript->add(<<_EOF_
@@ -1627,31 +1627,32 @@
     $dupList = " | grep -v -f \"${buildDir}/download/${asmId}.remove.dups.list\" ";
   }
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 export gffFile=$gffFile
 
 function cleanUp() {
   rm -f \$asmId.ncbiGene.genePred.gz \$asmId.ncbiGene.genePred
   rm -f \$asmId.geneAttrs.ncbi.txt
 }
 
 if [ \$gffFile -nt \$asmId.ncbiGene.bb ]; then
   (gff3ToGenePred -warnAndContinue -useName \\
     -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\
-      2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter stdin stdout \\
+      2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter \\
+         -chromSizes=../../\$asmId.chrom.sizes stdin stdout \\
         $dupList | gzip -c > \$asmId.ncbiGene.genePred.gz
   genePredCheck \$asmId.ncbiGene.genePred.gz
   export howMany=`genePredCheck \$asmId.ncbiGene.genePred.gz 2>&1 | grep "^checked" | awk '{print \$2}'`
   if [ "\${howMany}" -eq 0 ]; then
      printf "# ncbiGene: no gene definitions found in \$gffFile\n";
      cleanUp
      exit 0
   fi
   export ncbiGenePred="\$asmId.ncbiGene.genePred.gz"
 _EOF_
   );
   if ($ucscNames) {
     $bossScript->add(<<_EOF_
   liftUp -extGenePred -type=.gp stdout \\
     ../../sequence/\$asmId.ncbiToUcsc.lift warn \\
@@ -1913,34 +1914,34 @@
   "$HgAutomate::clusterData/asmHubs/refseqBuild/$ftpDir";
 
 $sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir;
 $assemblySource = $opt_sourceDir ? "$opt_sourceDir" : "$sourceDir/$ftpDir";
 my $asmReport = "$assemblySource/${asmId}_assembly_report.txt";
 
 $species = $opt_species ? $opt_species : $species;
 
 if (length($species) < 1) {
   if (-s "$asmReport") {
      $species = `grep -i "organism name:" $asmReport`;
      chomp $species;
      $species =~ s/.*organism\s+name:\s+//i;
      $species =~ s/\s+\(.*//;
   } else {
-     die "no -species specified and can not find $asmReport";
+     die "ERROR: no -species specified and can not find $asmReport";
   }
   if (length($species) < 1) {
-     die "no -species specified and can not find Organism name: in $asmReport";
+     die "ERROR: no -species specified and can not find Organism name: in $asmReport";
   }
 }
 
 $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species;
 $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies;
 $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq;
 $ucscNames = $opt_ucscNames ? 1 : $ucscNames;   # '1' == 'TRUE'
 $noAugustus = $opt_noAugustus ? 1 : $noAugustus;   # '1' == 'TRUE'
 $noXenoRefSeq = $opt_noXenoRefSeq ? 1 : $noXenoRefSeq;   # '1' == 'TRUE'
 $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse;
 $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub;
 $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub;
 $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer;
 $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName;
 $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0;