940dbe5d9aa8d7d58f364d2e152324f2ec4ec331 hiram Tue Jan 14 11:38:33 2020 -0800 now running up ncbiRefSeq gene tracks refs #24748 diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl index 36b4afd..43900b1 100755 --- src/hg/utils/automation/doNcbiRefSeq.pl +++ src/hg/utils/automation/doNcbiRefSeq.pl @@ -465,61 +465,77 @@ # establish all variables to use here export db=$db export asmId=$asmId _EOF_ ); if (! $dbExists) { $bossScript->add(<<_EOF_ export target2bit=$dbTwoBit twoBitInfo \$target2bit stdout | sort -k2,2nr > \$db.chrom.sizes wget -O bigGenePred.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigGenePred.as' wget -O bigPsl.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigPsl.as' +### overall gene track with both predicted and curated genePredToBigGenePred process/\$db.ncbiRefSeq.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeq.bigGp -bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\ + +bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as -extraIndex=name \\ \$db.ncbiRefSeq.bigGp \$db.chrom.sizes \\ \$db.ncbiRefSeq.bb bigBedInfo \$db.ncbiRefSeq.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$db.ncbiRefSeq.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeq %s %'d %s %'d\\n" `cat \$db.ncbiRefSeq.stats.txt` | xargs echo ~/kent/src/hg/utils/automation/gpToIx.pl process/\$db.ncbiRefSeq.gp \\ | sort -u > \$asmId.ncbiRefSeq.ix.txt -ixIxx \$asmId.ncbiRefSeq.ix.txt \$asmId.ncbiRefSeq.ix \$asmId.ncbiRefSeq.ixx +ixIxx \$asmId.ncbiRefSeq.ix.txt \$asmId.ncbiRefSeq.ix{,x} rm -f \$asmId.ncbiRefSeq.ix.txt + +### curated only if present if [ -s process/\$db.curated.gp ]; then genePredToBigGenePred process/\$db.curated.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqCurated.bigGp - bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\ + bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as -extraIndex=name \\ \$db.ncbiRefSeqCurated.bigGp \$db.chrom.sizes \\ \$db.ncbiRefSeqCurated.bb rm -f \$db.ncbiRefSeqCurated.bigGp bigBedInfo \$db.ncbiRefSeqCurated.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$db.ncbiRefSeqCurated.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeqCurated %s %'d %s %'d\\n" `cat \$db.ncbiRefSeqCurated.stats.txt` | xargs echo + ~/kent/src/hg/utils/automation/gpToIx.pl process/\$db.curated.gp \\ + | sort -u > \$asmId.ncbiRefSeqCurated.ix.txt + ixIxx \$asmId.ncbiRefSeqCurated.ix.txt \$asmId.ncbiRefSeqCurated.ix{,x} + rm -f \$asmId.ncbiRefSeqCurated.ix.txt fi + +### predicted only if present if [ -s process/\$db.predicted.gp ]; then genePredToBigGenePred process/\$db.predicted.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqPredicted.bigGp - bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as \\ + bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as -extraIndex=name \\ \$db.ncbiRefSeqPredicted.bigGp \$db.chrom.sizes \\ \$db.ncbiRefSeqPredicted.bb rm -f \$db.ncbiRefSeqPredicted.bigGp bigBedInfo \$db.ncbiRefSeqPredicted.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$db.ncbiRefSeqPredicted.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeqPredicted %s %'d %s %'d\\n" `cat \$db.ncbiRefSeqPredicted.stats.txt` | xargs echo + ~/kent/src/hg/utils/automation/gpToIx.pl process/\$db.predicted.gp \\ + | sort -u > \$asmId.ncbiRefSeqPredicted.ix.txt + ixIxx \$asmId.ncbiRefSeqPredicted.ix.txt \$asmId.ncbiRefSeqPredicted.ix{,x} + rm -f \$asmId.ncbiRefSeqPredicted.ix.txt fi + +### all other annotations, not necessarily genes if [ -s "process/\$db.other.bb" ]; then ln -f -s process/\$db.other.bb \$db.ncbiRefSeqOther.bb bigBedInfo \$db.ncbiRefSeqOther.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$db.ncbiRefSeqOther.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeqOther %s %'d %s %'d\\n" `cat \$asmId.ncbiRefSeqOther.stats.txt` | xargs echo fi if [ -s "process/ncbiRefSeqOther.ix" ]; then ln -f -s process/ncbiRefSeqOther.ix ./\$db.ncbiRefSeqOther.ix ln -f -s process/ncbiRefSeqOther.ixx ./\$db.ncbiRefSeqOther.ixx fi ln -f -s process/ncbiRefSeqVersion.txt ./\$db.ncbiRefSeqVersion.txt # select only coding genes to have CDS records awk -F" " '\$6 != \$7 {print \$1;}' process/\$db.ncbiRefSeq.gp \\ | sort -u > coding.cds.name.list @@ -562,31 +578,31 @@ pslToBed missingChrMFa.psl stdout \\ | twoBitToFa -bed=stdin \$target2bit stdout >> \$db.rna.fa fi fi export totalBases=`ave -col=2 \$db.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'` export basesCovered=`bedSingleCover.pl \$db.ncbiRefSeq.bigGp | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'` export percentCovered=`echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'` printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" \\ "\$totalBases" "\$percentCovered" > fb.ncbiRefSeq.\$db.txt rm -f \$db.ncbiRefSeq.bigGp pslToBigPsl -fa=download/\$asmId.rna.fa.gz -cds=process/\$asmId.rna.cds \\ process/\$asmId.\$db.psl.gz stdout | sort -k1,1 -k2,2n > \$asmId.bigPsl -bedToBigBed -type=bed12+13 -tab -as=bigPsl.as \\ +bedToBigBed -type=bed12+13 -tab -as=bigPsl.as -extraIndex=name \\ \$asmId.bigPsl \$db.chrom.sizes \$asmId.bigPsl.bb rm -f \$asmId.bigPsl _EOF_ ); } else { $bossScript->add(<<_EOF_ # loading the genePred tracks, all genes in one, and subsets hgLoadGenePred -genePredExt \$db ncbiRefSeq process/\$db.ncbiRefSeq.gp $genePredCheckDb ncbiRefSeq if [ -s process/\$db.curated.gp ]; then hgLoadGenePred -genePredExt \$db ncbiRefSeqCurated process/\$db.curated.gp $genePredCheckDb ncbiRefSeqCurated fi