ba856026204f578954c3cd7468c26d173e66d58f hiram Fri May 28 11:37:58 2021 -0700 correctly load RefSeqSelect table and add loading of RefSeqHgmd table for hg19 and hg38 refs #27575 diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl index f74e1db..15b6bdd 100755 --- src/hg/utils/automation/doNcbiRefSeq.pl +++ src/hg/utils/automation/doNcbiRefSeq.pl @@ -351,31 +351,32 @@ # this produces the genePred in NCBI coordinates # 8/23/17: gff3ToGenePred quits over illegal attribute SO_type... make it legal (so_type): if [ -s ../../../download/\${asmId}.remove.dups.list ]; then zcat \$ncbiGffGz | grep -v -f ../../../download/\${asmId}.remove.dups.list \\ | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\ | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\ -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp else zcat \$ncbiGffGz \\ | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\ | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\ -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp fi genePredCheck \$asmId.gp -zcat \$ncbiGffGz | egrep 'tag=(RefSeq|MANE) Select' || true > before.cut9.txt +rm -f \$asmId.refseqSelectTranscripts.txt +zegrep 'tag=(RefSeq|MANE) Select' \$ncbiGffGz > before.cut9.txt || true if [ -s before.cut9.txt ]; then cut -f9- before.cut9.txt | tr ';' '\\n' \\ | grep 'Name=' | grep -v NP_ | cut -d= -f2 | sort -u \\ > \$asmId.refseqSelectTranscripts.txt fi rm -f before.cut9.txt # extract labels from semi-structured text in gbff COMMENT/description sections: zcat \$downloadDir/\${asmId}_rna.gbff.gz \\ | (grep ' :: ' || true) \\ | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \\ | sort -u \\ > pragmaLabels.txt @@ -399,53 +400,67 @@ _EOF_ ); } $bossScript->add(<<_EOF_ $genePredCheckDb \$asmId.\$db.gp.gz zcat \$asmId.\$db.gp.gz > ncbiRefSeq.\$dateStamp genePredToGtf -utr file ncbiRefSeq.\$dateStamp stdout | gzip -c \\ > ../\$db.\$dateStamp.ncbiRefSeq.gtf.gz rm -f ncbiRefSeq.\$dateStamp # curated subset of all genes (zegrep "^[NY][MRP]_" \$asmId.\$db.gp.gz || true) > \$db.curated.gp # may not be any curated genes if [ ! -s \$db.curated.gp ]; then rm -f \$db.curated.gp -elif [ -s \$asmId.refseqSelectTranscripts.txt ]; then + rm -f \$db.refseqSelect.curated.gp + rm -f hgmd.curated.gp +else + if [ -s \$asmId.refseqSelectTranscripts.txt ]; then cat \$db.curated.gp | fgrep -f \$asmId.refseqSelectTranscripts.txt - \\ > \$db.refseqSelect.curated.gp # may not be any refseqSelect.curated genes if [ ! -s \$db.refseqSelect.curated.gp ]; then rm -f \$db.refseqSelect.curated.gp fi fi + if [ \$db = "hg19" -o \$db = "hg38" ]; then + hgmdFile=`ls /hive/data/outside/hgmd/20*.4-hgmd-public_hg38.tsv | tail -1` + if [ -s "\$hgmdFile" ]; then + cut -f7 "\$hgmdFile" | cut -d. -f1 | sort -u | awk '{printf "%s.\\n", \$1}' > hgmdTranscripts.txt + fgrep -f hgmdTranscripts.txt \$db.curated.gp > hgmd.curated.gp + fi + fi +fi # predicted subset of all genes (zegrep "^X[MR]_" \$asmId.\$db.gp.gz || true) > \$db.predicted.gp # not curated or predicted subset of all genes, the left overs (zegrep -v "^[NXY][MRP]_" \$asmId.\$db.gp.gz || true) > \$db.other.gp # curated and predicted without leftovers: (zegrep "^[NXY][MRP]_" \$asmId.\$db.gp.gz || true) > \$db.ncbiRefSeq.gp if [ -s \$db.curated.gp ]; then $genePredCheckDb \$db.curated.gp if [ -s \$db.refseqSelect.curated.gp ]; then $genePredCheckDb \$db.refseqSelect.curated.gp fi + if [ -s hgmd.curated.gp ]; then + $genePredCheckDb hgmd.curated.gp + fi fi if [ -s \$db.predicted.gp ]; then $genePredCheckDb \$db.predicted.gp fi if [ -s \$db.other.gp ]; then $genePredCheckDb \$db.other.gp fi # join the refLink metadata with curated+predicted names cut -f1 \$db.ncbiRefSeq.gp | sort -u > \$asmId.\$db.name.list join -t\$'\\t' \$asmId.\$db.name.list \$asmId.refLink.tab > \$asmId.\$db.ncbiRefSeqLink.tab # Make bigBed with attributes in extra columns for ncbiRefSeqOther: twoBitInfo $dbTwoBit stdout | sort -k2,2n > \$db.chrom.sizes @@ -629,30 +644,45 @@ ### and refseqSelect if exists (a subset of curated) if [ -s process/\$db.refseqSelect.curated.gp ]; then genePredToBigGenePred process/\$db.refseqSelect.curated.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqSelectCurated.bigGp bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as -extraIndex=name \\ \$db.ncbiRefSeqSelectCurated.bigGp \$db.chrom.sizes \\ \$db.ncbiRefSeqSelectCurated.bb rm -f \$db.ncbiRefSeqSelectCurated.bigGp bigBedInfo \$db.ncbiRefSeqSelectCurated.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$db.ncbiRefSeqSelectCurated.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeqSelectCurated %s %'d %s %'d\\n" `cat \$db.ncbiRefSeqSelectCurated.stats.txt` | xargs echo ~/kent/src/hg/utils/automation/gpToIx.pl process/\$db.refseqSelect.curated.gp \\ | sort -u > \$asmId.ncbiRefSeqSelectCurated.ix.txt ixIxx \$asmId.ncbiRefSeqSelectCurated.ix.txt \$asmId.ncbiRefSeqSelectCurated.ix{,x} rm -f \$asmId.ncbiRefSeqSelectCurated.ix.txt fi +### and hgmd if exists (a subset of curated) + if [ -s process/hgmd.curated.gp ]; then + genePredToBigGenePred process/hgmd.curated.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqHgmd.bigGp + bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as -extraIndex=name \\ + \$db.ncbiRefSeqHgmd.bigGp \$db.chrom.sizes \\ + \$db.ncbiRefSeqHgmd.bb + rm -f \$db.ncbiRefSeqHgmd.bigGp + bigBedInfo \$db.ncbiRefSeqHgmd.bb | egrep "^itemCount:|^basesCovered:" \\ + | sed -e 's/,//g' > \$db.ncbiRefSeqHgmd.stats.txt + LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeqHgmd %s %'d %s %'d\\n" `cat \$db.ncbiRefSeqHgmd.stats.txt` | xargs echo + ~/kent/src/hg/utils/automation/gpToIx.pl process/hgmd.curated.gp \\ + | sort -u > \$asmId.ncbiRefSeqHgmd.ix.txt + ixIxx \$asmId.ncbiRefSeqHgmd.ix.txt \$asmId.ncbiRefSeqHgmd.ix{,x} + rm -f \$asmId.ncbiRefSeqHgmd.ix.txt + fi fi ### predicted only if present if [ -s process/\$db.predicted.gp ]; then genePredToBigGenePred process/\$db.predicted.gp stdout | sort -k1,1 -k2,2n > \$db.ncbiRefSeqPredicted.bigGp bedToBigBed -type=bed12+8 -tab -as=bigGenePred.as -extraIndex=name \\ \$db.ncbiRefSeqPredicted.bigGp \$db.chrom.sizes \\ \$db.ncbiRefSeqPredicted.bb rm -f \$db.ncbiRefSeqPredicted.bigGp bigBedInfo \$db.ncbiRefSeqPredicted.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$db.ncbiRefSeqPredicted.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiRefSeqPredicted %s %'d %s %'d\\n" `cat \$db.ncbiRefSeqPredicted.stats.txt` | xargs echo ~/kent/src/hg/utils/automation/gpToIx.pl process/\$db.predicted.gp \\ | sort -u > \$asmId.ncbiRefSeqPredicted.ix.txt ixIxx \$asmId.ncbiRefSeqPredicted.ix.txt \$asmId.ncbiRefSeqPredicted.ix{,x} @@ -774,30 +804,34 @@ ); } else { # processing for a database genome $bossScript->add(<<_EOF_ # loading the genePred tracks, all genes in one, and subsets hgLoadGenePred -genePredExt \$db ncbiRefSeq process/\$db.ncbiRefSeq.gp $genePredCheckDb ncbiRefSeq if [ -s process/\$db.curated.gp ]; then hgLoadGenePred -genePredExt \$db ncbiRefSeqCurated process/\$db.curated.gp $genePredCheckDb ncbiRefSeqCurated if [ -s process/\$db.refseqSelect.curated.gp ]; then hgLoadGenePred -genePredExt \$db ncbiRefSeqSelect process/\$db.refseqSelect.curated.gp $genePredCheckDb ncbiRefSeqSelect fi + if [ -s process/hgmd.curated.gp ]; then + hgLoadGenePred -genePredExt \$db ncbiRefSeqHgmd process/hgmd.curated.gp + $genePredCheckDb ncbiRefSeqHgmd + fi fi if [ -s process/\$db.predicted.gp ]; then hgLoadGenePred -genePredExt \$db ncbiRefSeqPredicted process/\$db.predicted.gp $genePredCheckDb ncbiRefSeqPredicted fi mkdir -p $gbdbDir ln -f -s `pwd`/process/\$db.other.bb $gbdbDir/ncbiRefSeqOther.bb hgBbiDbLink \$db ncbiRefSeqOther $gbdbDir/ncbiRefSeqOther.bb ln -f -s `pwd`/process/ncbiRefSeqOther.ix{,x} $gbdbDir/ ln -f -s `pwd`/process/ncbiRefSeqVersion.txt $gbdbDir/ # select only coding genes to have CDS records