06d7be056190c14b85e71bc12523f18ea6815b5e markd Mon Dec 7 00:50:29 2020 -0800 BLAT mmap index support merge with master diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl index 72d49aa..7698583 100755 --- src/hg/utils/automation/doNcbiRefSeq.pl +++ src/hg/utils/automation/doNcbiRefSeq.pl @@ -338,34 +338,41 @@ export ncbiGffGz=\$downloadDir/\${asmId}_genomic.gff.gz export db=$db export gff3ToRefLink=$gff3ToRefLink export gbffToCds=$gbffToCds export dateStamp=`date "+%F"` export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //'` if [ "\$annotationRelease" == "" ]; then export annotationRelease=\$asmId fi export versionDate=`ls -L --full-time \$ncbiGffGz | awk '{print \$6;}'` echo "\$annotationRelease (\$versionDate)" > ncbiRefSeqVersion.txt # this produces the genePred in NCBI coordinates # 8/23/17: gff3ToGenePred quits over illegal attribute SO_type... make it legal (so_type): +if [ -s ../../../download/\${asmId}.remove.dups.list ]; then + zcat \$ncbiGffGz | grep -v -f ../../../download/\${asmId}.remove.dups.list \\ + | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\ + | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\ + -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp +else zcat \$ncbiGffGz \\ | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\ | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\ -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp +fi genePredCheck \$asmId.gp zcat \$ncbiGffGz | egrep 'tag=(RefSeq|MANE) Select' || true > before.cut9.txt if [ -s before.cut9.txt ]; then cut -f9- before.cut9.txt | tr ';' '\\n' \\ | grep 'Name=' | grep -v NP_ | cut -d= -f2 | sort -u \\ > \$asmId.refseqSelectTranscripts.txt fi rm -f before.cut9.txt # extract labels from semi-structured text in gbff COMMENT/description sections: zcat \$downloadDir/\${asmId}_rna.gbff.gz \\ | (grep ' :: ' || true) \\ | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \\ @@ -429,31 +436,36 @@ fi fi if [ -s \$db.predicted.gp ]; then $genePredCheckDb \$db.predicted.gp fi if [ -s \$db.other.gp ]; then $genePredCheckDb \$db.other.gp fi # join the refLink metadata with curated+predicted names cut -f1 \$db.ncbiRefSeq.gp | sort -u > \$asmId.\$db.name.list join -t\$'\\t' \$asmId.\$db.name.list \$asmId.refLink.tab > \$asmId.\$db.ncbiRefSeqLink.tab # Make bigBed with attributes in extra columns for ncbiRefSeqOther: twoBitInfo $dbTwoBit stdout | sort -k2,2n > \$db.chrom.sizes +if [ -s ../../../download/\${asmId}.remove.dups.list ]; then + genePredToBed -tab -fillSpace \$db.other.gp stdout | sort -k1,1 -k2n,2n \\ + | grep -v -f ../../../download/\${asmId}.remove.dups.list > \$db.other.bed +else genePredToBed -tab -fillSpace \$db.other.gp stdout | sort -k1,1 -k2n,2n > \$db.other.bed +fi $ncbiRefSeqOtherAttrs \$db.other.bed \$asmId.attrs.txt > \$db.other.extras.bed bedToBigBed -type=bed12+13 -as=ncbiRefSeqOther.as -tab \\ -extraIndex=name \\ \$db.other.extras.bed \$db.chrom.sizes \$db.other.bb # Make trix index for ncbiRefSeqOther $ncbiRefSeqOtherIxIxx \\ ncbiRefSeqOther.as \$db.other.extras.bed > ncbiRefSeqOther.ix.tab ixIxx ncbiRefSeqOther.ix.tab ncbiRefSeqOther.ix{,x} # PSL data will be loaded into a psl type track to show the alignments (zgrep "^#" \$ncbiGffGz | head || true) > gffForPsl.gff if [ -s ../../../download/\${asmId}.remove.dups.list ]; then (zegrep -v "NG_" \$ncbiGffGz || true) \\ @@ -707,39 +719,43 @@ $bossScript->add(<<_EOF_ # If \$db.noRna.available.list is not empty but items are on chrM, # make fake cDNA sequence for them using chrM sequence # since NCBI puts proteins, not coding RNAs, in the GFF. if [ -s \$db.noRna.available.list ]; then pslCat -nohead process/\$asmId.\$db.psl.gz \\ | grep -Fwf \$db.noRna.available.list \\ | egrep "$nonNucNames" > missingChrMFa.psl if [ -s missingChrMFa.psl ]; then pslToBed missingChrMFa.psl stdout \\ | twoBitToFa -bed=stdin \$target2bit stdout >> \$db.rna.fa fi fi -if [ -s process/\$asmId.rna.cds.gz ]; then - zcat process/\$asmId.rna.cds.gz egrep '[0-9]+\\.\\.[0-9]\\+' \\ - pslMismatchGapToBed -cdsFile=stdin -db=\$db -ignoreQNamePrefix=X \\ +if [ -s process/\$asmId.rna.cds ]; then + cat process/\$asmId.rna.cds | grep '[0-9]\\+\\.\\.[0-9]\\+' \\ + | pslMismatchGapToBed -cdsFile=stdin -db=\$db -ignoreQNamePrefix=X \\ process/\$asmId.\$db.psl.gz \$target2bit \\ \$db.rna.fa ncbiRefSeqGenomicDiff || true + if [ -s ncbiRefSeqGenomicDiff.bed ]; then wget -O txAliDiff.as 'http://genome-source.soe.ucsc.edu/gitlist/kent.git/raw/master/src/hg/lib/txAliDiff.as' bedToBigBed -type=bed9+ -tab -as=txAliDiff.as \\ ncbiRefSeqGenomicDiff.bed \$db.chrom.sizes ncbiRefSeqGenomicDiff.bb + else + rm -f ncbiRefSeqGenomicDiff.bed + fi fi export totalBases=`ave -col=2 \$db.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'` export basesCovered=`bedSingleCover.pl \$db.ncbiRefSeq.bigGp | ave -col=4 stdin | grep "^total" | awk '{printf "%d", \$2}'` export percentCovered=`echo \$basesCovered \$totalBases | awk '{printf "%.3f", 100.0*\$1/\$2}'` printf "%d bases of %d (%s%%) in intersection\\n" "\$basesCovered" \\ "\$totalBases" "\$percentCovered" > fb.ncbiRefSeq.\$db.txt printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$asmId.ncbiRefSeq.txt rm -f \$db.ncbiRefSeq.bigGp \$asmId.exons.bed pslToBigPsl -fa=download/\$asmId.rna.fa.gz -cds=process/\$asmId.rna.cds \\ process/\$asmId.\$db.psl.gz stdout | sort -k1,1 -k2,2n > \$asmId.bigPsl bedToBigBed -type=bed12+13 -tab -as=bigPsl.as -extraIndex=name \\ \$asmId.bigPsl \$db.chrom.sizes \$asmId.bigPsl.bb @@ -815,40 +831,44 @@ pslCat -nohead process/\$asmId.\$db.psl.gz \\ | grep -Fwf \$db.noRna.available.list \\ | grep chrM > missingChrMFa.psl if [ -s missingChrMFa.psl ]; then pslToBed missingChrMFa.psl stdout \\ | twoBitToFa -bed=stdin $dbTwoBit stdout >> \$db.rna.fa fi fi mkdir -p $gbdbDir ln -f -s `pwd`/\$db.rna.fa $gbdbDir/seqNcbiRefSeq.rna.fa hgLoadSeq -drop -seqTbl=seqNcbiRefSeq -extFileTbl=extNcbiRefSeq \$db $gbdbDir/seqNcbiRefSeq.rna.fa hgLoadPsl \$db -table=ncbiRefSeqPsl process/\$asmId.\$db.psl.gz -if [ -s process/\$asmId.rna.cds.gz ]; then - zcat process/\$asmId.rna.cds.gz egrep '[0-9]+\\.\\.[0-9]\\+' \\ - pslMismatchGapToBed -cdsFile=stdin -db=\$db -ignoreQNamePrefix=X \\ +if [ -s process/\$asmId.rna.cds ]; then + zcat process/\$asmId.rna.cds | grep '[0-9]\\+\\.\\.[0-9]\\+' \\ + | pslMismatchGapToBed -cdsFile=stdin -db=\$db -ignoreQNamePrefix=X \\ process/\$asmId.\$db.psl.gz $dbTwoBit \\ \$db.rna.fa ncbiRefSeqGenomicDiff || true - bedToBigBed -type=bed9+ -tab -as=~/kent/src/hg/lib/txAliDiff.as \\ - ncbiRefSeqGenomicDiff.bed process/\$db.chrom.sizes ncbiRefSeqGenomicDiff.bb rm -f $gbdbDir/ncbiRefSeqGenomicDiff.bb + if [ -s ncbiRefSeqGenomicDiff.bed ]; then + bedToBigBed -type=bed9+ -tab -as=\${HOME}/kent/src/hg/lib/txAliDiff.as \\ + ncbiRefSeqGenomicDiff.bed process/\$db.chrom.sizes ncbiRefSeqGenomicDiff.bb ln -s `pwd`/ncbiRefSeqGenomicDiff.bb $gbdbDir/ncbiRefSeqGenomicDiff.bb + else + rm -f ncbiRefSeqGenomicDiff.bed + fi fi if [ -d "/usr/local/apache/htdocs-hgdownload/goldenPath/archive" ]; then gtfFile=`ls \$db.*.ncbiRefSeq.gtf.gz` mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq rm -f /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq/\$gtfFile ln -s `pwd`/\$db.*.ncbiRefSeq.gtf.gz \\ /usr/local/apache/htdocs-hgdownload/goldenPath/archive/\$db/ncbiRefSeq/ fi featureBits \$db ncbiRefSeq > fb.ncbiRefSeq.\$db.txt 2>&1 cat fb.ncbiRefSeq.\$db.txt 2>&1 _EOF_ ); } # if ($dbExists)