src/hg/utils/automation/doNcbiRefSeq.pl 608fb8b82f76fc54c4189815cad4ce27bf57b5a0

608fb8b82f76fc54c4189815cad4ce27bf57b5a0
hiram
  Mon Jan 13 13:16:53 2020 -0800
now running up NCBI RefSeq track on assembly hubs refs #24748

diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl
index 0d9fdc5..36b4afd 100755
--- src/hg/utils/automation/doNcbiRefSeq.pl
+++ src/hg/utils/automation/doNcbiRefSeq.pl
@@ -469,46 +469,62 @@
 
 _EOF_
   );
   if (! $dbExists) {
     $bossScript->add(<<_EOF_
 export target2bit=$dbTwoBit
 
 twoBitInfo \$target2bit stdout | sort -k2,2nr > \$db.chrom.sizes
 wget -O bigGenePred.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigGenePred.as'
 wget -O bigPsl.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigPsl.as'
 
 genePredToBigGenePred process/\$db.ncbiRefSeq.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeq.bigGp
 bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
   \$db.ncbiRefSeq.bigGp \$db.chrom.sizes \\
     \$db.ncbiRefSeq.bb
+bigBedInfo \$db.ncbiRefSeq.bb | egrep "^itemCount:|^basesCovered:" \\
+    | sed -e 's/,//g' > \$db.ncbiRefSeq.stats.txt
+LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeq %s %'d %s %'d\\n" `cat \$db.ncbiRefSeq.stats.txt` | xargs echo
+~/kent/src/hg/utils/automation/gpToIx.pl process/\$db.ncbiRefSeq.gp \\
+    | sort -u > \$asmId.ncbiRefSeq.ix.txt
+ixIxx \$asmId.ncbiRefSeq.ix.txt \$asmId.ncbiRefSeq.ix \$asmId.ncbiRefSeq.ixx
+rm -f \$asmId.ncbiRefSeq.ix.txt
 if [ -s process/\$db.curated.gp ]; then
   genePredToBigGenePred process/\$db.curated.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqCurated.bigGp
   bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
   \$db.ncbiRefSeqCurated.bigGp \$db.chrom.sizes \\
     \$db.ncbiRefSeqCurated.bb
   rm -f \$db.ncbiRefSeqCurated.bigGp
+  bigBedInfo \$db.ncbiRefSeqCurated.bb | egrep "^itemCount:|^basesCovered:" \\
+    | sed -e 's/,//g' > \$db.ncbiRefSeqCurated.stats.txt
+  LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeqCurated %s %'d %s %'d\\n" `cat \$db.ncbiRefSeqCurated.stats.txt` | xargs echo
 fi
 if [ -s process/\$db.predicted.gp ]; then
   genePredToBigGenePred process/\$db.predicted.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqPredicted.bigGp
   bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\
   \$db.ncbiRefSeqPredicted.bigGp \$db.chrom.sizes \\
     \$db.ncbiRefSeqPredicted.bb
   rm -f \$db.ncbiRefSeqPredicted.bigGp
+  bigBedInfo \$db.ncbiRefSeqPredicted.bb | egrep "^itemCount:|^basesCovered:" \\
+    | sed -e 's/,//g' > \$db.ncbiRefSeqPredicted.stats.txt
+  LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeqPredicted %s %'d %s %'d\\n" `cat \$db.ncbiRefSeqPredicted.stats.txt` | xargs echo
 fi
 if [ -s "process/\$db.other.bb" ]; then
   ln -f -s process/\$db.other.bb \$db.ncbiRefSeqOther.bb
+  bigBedInfo \$db.ncbiRefSeqOther.bb | egrep "^itemCount:|^basesCovered:" \\
+    | sed -e 's/,//g' > \$db.ncbiRefSeqOther.stats.txt  # commas stripped before the values reach printf
+  LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeqOther %s %'d %s %'d\\n" `cat \$db.ncbiRefSeqOther.stats.txt` | xargs echo
 fi
 if [ -s "process/ncbiRefSeqOther.ix" ]; then
   ln -f -s process/ncbiRefSeqOther.ix ./\$db.ncbiRefSeqOther.ix
   ln -f -s process/ncbiRefSeqOther.ixx ./\$db.ncbiRefSeqOther.ixx
 fi
 ln -f -s process/ncbiRefSeqVersion.txt ./\$db.ncbiRefSeqVersion.txt
 # select only coding genes to have CDS records
 
 awk -F" " '\$6 != \$7 {print \$1;}' process/\$db.ncbiRefSeq.gp \\
   | sort -u > coding.cds.name.list
 
 join -t\$'\t' coding.cds.name.list process/\$asmId.rna.cds \\
   > \$db.ncbiRefSeqCds.tab
 
 rm -f coding.cds.name.list
@@ -545,33 +561,34 @@
   if [ -s missingChrMFa.psl ]; then
     pslToBed missingChrMFa.psl stdout \\
       | twoBitToFa -bed=stdin \$target2bit stdout >> \$db.rna.fa
   fi
 fi
 
 export totalBases=`ave -col=2 \$db.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`
 export basesCovered=`bedSingleCover.pl \$db.ncbiRefSeq.bigGp | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'`
 export percentCovered=`echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'`
 printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" \\
    "\$totalBases" "\$percentCovered" > fb.ncbiRefSeq.\$db.txt
 
 rm -f \$db.ncbiRefSeq.bigGp
 
 pslToBigPsl -fa=download/\$asmId.rna.fa.gz -cds=process/\$asmId.rna.cds \\
-  process/\$asmId.\$db.psl.gz stdout | sort -k1,1 -k2,2n > \$asmId.\$db.bigPsl
+  process/\$asmId.\$db.psl.gz stdout | sort -k1,1 -k2,2n > \$asmId.bigPsl
 bedToBigBed -type=bed12+13 -tab -as=bigPsl.as \\
-  \$asmId.\$db.bigPsl \$db.chrom.sizes \$asmId.\$db.bigPsl.bb
+  \$asmId.bigPsl \$db.chrom.sizes \$asmId.bigPsl.bb
+rm -f \$asmId.bigPsl
 _EOF_
     );
   } else {
 
     $bossScript->add(<<_EOF_
 # loading the genePred tracks, all genes in one, and subsets
 hgLoadGenePred -genePredExt \$db ncbiRefSeq process/\$db.ncbiRefSeq.gp
 $genePredCheckDb ncbiRefSeq
 
 if [ -s process/\$db.curated.gp ]; then
   hgLoadGenePred -genePredExt \$db ncbiRefSeqCurated process/\$db.curated.gp
   $genePredCheckDb ncbiRefSeqCurated
 fi
 
 if [ -s process/\$db.predicted.gp ]; then