97199fbf1ff56c9ee01956deb736b97244ea3ac6 hiram Sun Mar 1 21:52:28 2020 -0800 calculating featureBits like measurement for gene tracks, removing duplicates for ncbiRefSeq, remove blanks from gene names for ncbiRefSeq, and fix fundamental bug reference to geneToId in ncbiRefSeqOtherAttrs.pl refs #23891 diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl index 394369d..52c8841 100755 --- src/hg/utils/automation/doNcbiRefSeq.pl +++ src/hg/utils/automation/doNcbiRefSeq.pl @@ -416,34 +416,42 @@ twoBitInfo $dbTwoBit stdout | sort -k2,2n > \$db.chrom.sizes genePredToBed -tab -fillSpace \$db.other.gp stdout | sort -k1,1 -k2n,2n > \$db.other.bed $ncbiRefSeqOtherAttrs \$db.other.bed \$asmId.attrs.txt > \$db.other.extras.bed bedToBigBed -type=bed12+13 -as=ncbiRefSeqOther.as -tab \\ -extraIndex=name \\ \$db.other.extras.bed \$db.chrom.sizes \$db.other.bb # Make trix index for ncbiRefSeqOther $ncbiRefSeqOtherIxIxx \\ ncbiRefSeqOther.as \$db.other.extras.bed > ncbiRefSeqOther.ix.tab ixIxx ncbiRefSeqOther.ix.tab ncbiRefSeqOther.ix{,x} # PSL data will be loaded into a psl type track to show the alignments (zgrep "^#" \$ncbiGffGz | head || true) > gffForPsl.gff +if [ -s ../../../download/\${asmId}.remove.dups.list ]; then (zegrep -v "NG_" \$ncbiGffGz || true) \\ + | grep -v -f ../../../download/\${asmId}.remove.dups.list \\ | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff gff3ToPsl -dropT \$downloadDir/\$asmId.ncbi.chrom.sizes \$downloadDir/rna.sizes \\ gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl +else + (zegrep -v "NG_" \$ncbiGffGz || true) \\ + | awk -F\$'\\t' '\$3 == "cDNA_match" || \$3 == "match"' >> gffForPsl.gff + gff3ToPsl -dropT \$downloadDir/\$asmId.ncbi.chrom.sizes \$downloadDir/rna.sizes \\ + gffForPsl.gff stdout | pslPosTarget stdin \$asmId.psl +fi simpleChain -outPsl -maxGap=300000 \$asmId.psl stdout | pslSwap stdin stdout \\ | liftUp -type=.psl stdout $localLiftFile warn stdin \\ | gzip -c > \$db.psl.gz pslCheck $pslTargetSizes \\ -querySizes=\$downloadDir/rna.sizes \$db.psl.gz # extract RNA CDS information from genbank record # Note: $asmId.raFile.txt could be used instead of _rna.gbff.gz \$gbffToCds \$downloadDir/\${asmId}_rna.gbff.gz | sort > \$asmId.rna.cds # the NCBI _genomic.gff.gz file only contains cDNA_match records for transcripts # that do not *exactly* match the reference genome. For all other transcripts # construct 'fake' PSL records representing the alignments of all cDNAs # that would be perfect matches to the reference genome. The pslFixCdsJoinGap # will fixup those records with unusual alignments due to frameshifts of @@ -495,31 +503,35 @@ export db=$db export asmId=$asmId _EOF_ ); if (! $dbExists) { $bossScript->add(<<_EOF_ export target2bit=$dbTwoBit twoBitInfo \$target2bit stdout | sort -k2,2nr > \$db.chrom.sizes wget -O bigGenePred.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigGenePred.as' wget -O bigPsl.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/bigPsl.as' ### overall gene track with both predicted and curated genePredToBigGenePred process/\$db.ncbiRefSeq.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeq.bigGp - +genePredToBed -tab -fillSpace process/\$db.ncbiRefSeq.gp stdout \\ + | bedToExons stdin stdout | bedSingleCover.pl stdin > \$asmId.exons.bed +export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' \$asmId.exons.bed` +export asmSizeNoGaps=`grep sequences ../../\$asmId.faSize.txt | awk '{print \$5}'` +export perCent=`echo \$baseCount \$asmSizeNoGaps | awk '{printf "%.3f", 100.0*\$1/\$2}'` bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as -extraIndex=name \\ \$db.ncbiRefSeq.bigGp \$db.chrom.sizes \\ \$db.ncbiRefSeq.bb bigBedInfo \$db.ncbiRefSeq.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$db.ncbiRefSeq.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeq %s %'d %s %'d\\n" `cat \$db.ncbiRefSeq.stats.txt` | xargs echo ~/kent/src/hg/utils/automation/gpToIx.pl process/\$db.ncbiRefSeq.gp \\ | sort -u > \$asmId.ncbiRefSeq.ix.txt ixIxx \$asmId.ncbiRefSeq.ix.txt \$asmId.ncbiRefSeq.ix{,x} rm -f \$asmId.ncbiRefSeq.ix.txt ### curated only if present if [ -s process/\$db.curated.gp ]; then genePredToBigGenePred process/\$db.curated.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqCurated.bigGp bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as -extraIndex=name \\ @@ -616,32 +628,33 @@ if [ -s \$db.noRna.available.list ]; then pslCat -nohead process/\$asmId.\$db.psl.gz \\ | grep -Fwf \$db.noRna.available.list \\ | grep chrM > missingChrMFa.psl if [ -s missingChrMFa.psl ]; then pslToBed missingChrMFa.psl stdout \\ | twoBitToFa -bed=stdin \$target2bit stdout >> \$db.rna.fa fi fi export totalBases=`ave -col=2 \$db.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'` export basesCovered=`bedSingleCover.pl \$db.ncbiRefSeq.bigGp | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'` export percentCovered=`echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'` printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" \\ "\$totalBases" "\$percentCovered" > fb.ncbiRefSeq.\$db.txt +printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$asmId.ncbiRefSeq.txt -rm -f \$db.ncbiRefSeq.bigGp +rm -f \$db.ncbiRefSeq.bigGp \$asmId.exons.bed pslToBigPsl -fa=download/\$asmId.rna.fa.gz -cds=process/\$asmId.rna.cds \\ process/\$asmId.\$db.psl.gz stdout | sort -k1,1 -k2,2n > \$asmId.bigPsl bedToBigBed -type=bed12+13 -tab -as=bigPsl.as -extraIndex=name \\ \$asmId.bigPsl \$db.chrom.sizes \$asmId.bigPsl.bb rm -f \$asmId.bigPsl _EOF_ ); } else { $bossScript->add(<<_EOF_ # loading the genePred tracks, all genes in one, and subsets hgLoadGenePred -genePredExt \$db ncbiRefSeq process/\$db.ncbiRefSeq.gp $genePredCheckDb ncbiRefSeq