src/hg/utils/automation/doNcbiRefSeq.pl 97199fbf1ff56c9ee01956deb736b97244ea3ac6

97199fbf1ff56c9ee01956deb736b97244ea3ac6
hiram
  Sun Mar 1 21:52:28 2020 -0800
calculating featureBits like measurement for gene tracks, removing duplicates for ncbiRefSeq, remove blanks from gene names for ncbiRefSeq, and fix fundamental bug reference to geneToId in ncbiRefSeqOtherAttrs.pl refs #23891

diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl
index 394369d..52c8841 100755
--- src/hg/utils/automation/doNcbiRefSeq.pl
+++ src/hg/utils/automation/doNcbiRefSeq.pl
@@ -416,34 +416,42 @@
 twoBitInfo $dbTwoBit stdout | sort -k2,2n > \$db.chrom.sizes
 genePredToBed -tab -fillSpace \$db.other.gp stdout | sort -k1,1 -k2n,2n > \$db.other.bed
 $ncbiRefSeqOtherAttrs \$db.other.bed \$asmId.attrs.txt > \$db.other.extras.bed
 bedToBigBed -type=bed12+13 -as=ncbiRefSeqOther.as -tab \\
   -extraIndex=name \\
   \$db.other.extras.bed \$db.chrom.sizes \$db.other.bb
 
 # Make trix index for ncbiRefSeqOther
 $ncbiRefSeqOtherIxIxx \\
   ncbiRefSeqOther.as \$db.other.extras.bed > ncbiRefSeqOther.ix.tab
 
 ixIxx ncbiRefSeqOther.ix.tab ncbiRefSeqOther.ix{,x}
 
 # PSL data will be loaded into a psl type track to show the alignments
 (zgrep "^#" \$ncbiGffGz | head || true) > gffForPsl.gff
+if [ -s ../../../download/\${asmId}.remove.dups.list ]; then
   (zegrep -v "NG_" \$ncbiGffGz || true) \\
+    | grep -v -f ../../../download/\${asmId}.remove.dups.list \\
     | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff
   gff3ToPsl -dropT \$downloadDir/\$asmId.ncbi.chrom.sizes \$downloadDir/rna.sizes \\
     gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl
+else
+  (zegrep -v "NG_" \$ncbiGffGz || true) \\
+    | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff
+  gff3ToPsl -dropT \$downloadDir/\$asmId.ncbi.chrom.sizes \$downloadDir/rna.sizes \\
+    gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl
+fi
 simpleChain -outPsl -maxGap=300000 \$asmId.psl stdout | pslSwap stdin stdout \\
   | liftUp -type=.psl stdout $localLiftFile warn stdin \\
    | gzip -c > \$db.psl.gz
 pslCheck $pslTargetSizes \\
    -querySizes=\$downloadDir/rna.sizes \$db.psl.gz
 
 # extract RNA CDS information from genbank record
 # Note: $asmId.raFile.txt could be used instead of _rna.gbff.gz
 \$gbffToCds \$downloadDir/\${asmId}_rna.gbff.gz | sort > \$asmId.rna.cds
 
 # the NCBI _genomic.gff.gz file only contains cDNA_match records for transcripts
 # that do not *exactly* match the reference genome.  For all other transcripts
 # construct 'fake' PSL records representing the alignments of all cDNAs
 # that would be perfect matches to the reference genome.  The pslFixCdsJoinGap
 # will fixup those records with unusual alignments due to frameshifts of
@@ -495,31 +503,35 @@
 export db=$db
 export asmId=$asmId
 
 _EOF_
   );
   if (! $dbExists) {
     $bossScript->add(<<_EOF_
 export target2bit=$dbTwoBit
 
 twoBitInfo \$target2bit stdout | sort -k2,2nr > \$db.chrom.sizes
 wget -O bigGenePred.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigGenePred.as'
 wget -O bigPsl.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigPsl.as'
 
 ### overall gene track with both predicted and curated
 genePredToBigGenePred process/\$db.ncbiRefSeq.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeq.bigGp
-
+genePredToBed -tab -fillSpace process/\$db.ncbiRefSeq.gp stdout \\
+    | bedToExons stdin stdout | bedSingleCover.pl stdin > \$asmId.exons.bed
+export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' \$asmId.exons.bed`
+export asmSizeNoGaps=`grep sequences ../../\$asmId.faSize.txt | awk '{print \$5}'`
+export perCent=`echo \$baseCount \$asmSizeNoGaps | awk '{printf "%.3f", 100.0*\$1/\$2}'`
 bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as -extraIndex=name \\
   \$db.ncbiRefSeq.bigGp \$db.chrom.sizes \\
     \$db.ncbiRefSeq.bb
 bigBedInfo \$db.ncbiRefSeq.bb | egrep "^itemCount:|^basesCovered:" \\
     | sed -e 's/,//g' > \$db.ncbiRefSeq.stats.txt
 LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeq %s %'d %s %'d\\n" `cat \$db.ncbiRefSeq.stats.txt` | xargs echo
 ~/kent/src/hg/utils/automation/gpToIx.pl process/\$db.ncbiRefSeq.gp \\
     | sort -u > \$asmId.ncbiRefSeq.ix.txt
 ixIxx \$asmId.ncbiRefSeq.ix.txt \$asmId.ncbiRefSeq.ix{,x}
 rm -f \$asmId.ncbiRefSeq.ix.txt
 
 ### curated only if present
 if [ -s process/\$db.curated.gp ]; then
   genePredToBigGenePred process/\$db.curated.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqCurated.bigGp
   bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as -extraIndex=name \\
@@ -616,32 +628,33 @@
 if [ -s \$db.noRna.available.list ]; then
   pslCat -nohead process/\$asmId.\$db.psl.gz \\
     | grep -Fwf \$db.noRna.available.list \\
       | grep chrM > missingChrMFa.psl
   if [ -s missingChrMFa.psl ]; then
     pslToBed missingChrMFa.psl stdout \\
       | twoBitToFa -bed=stdin \$target2bit stdout >> \$db.rna.fa
   fi
 fi
 
 export totalBases=`ave -col=2 \$db.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`
 export basesCovered=`bedSingleCover.pl \$db.ncbiRefSeq.bigGp | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'`
 export percentCovered=`echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'`
 printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" \\
    "\$totalBases" "\$percentCovered" > fb.ncbiRefSeq.\$db.txt
+printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$asmId.ncbiRefSeq.txt
 
-rm -f \$db.ncbiRefSeq.bigGp
+rm -f \$db.ncbiRefSeq.bigGp \$asmId.exons.bed
 
 pslToBigPsl -fa=download/\$asmId.rna.fa.gz -cds=process/\$asmId.rna.cds \\
   process/\$asmId.\$db.psl.gz stdout | sort -k1,1 -k2,2n > \$asmId.bigPsl
 bedToBigBed -type=bed12+13 -tab -as=bigPsl.as -extraIndex=name \\
   \$asmId.bigPsl \$db.chrom.sizes \$asmId.bigPsl.bb
 rm -f \$asmId.bigPsl
 _EOF_
     );
   } else {
 
     $bossScript->add(<<_EOF_
 # loading the genePred tracks, all genes in one, and subsets
 hgLoadGenePred -genePredExt \$db ncbiRefSeq process/\$db.ncbiRefSeq.gp
 $genePredCheckDb ncbiRefSeq