06be530f59c8628ccc70d7087528b2d78eaca226 hiram Mon Nov 30 15:49:52 2020 -0800 correctly removing duplicate names refs #24396 diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl index 6a191ca..7698583 100755 --- src/hg/utils/automation/doNcbiRefSeq.pl +++ src/hg/utils/automation/doNcbiRefSeq.pl @@ -338,34 +338,41 @@ export ncbiGffGz=\$downloadDir/\${asmId}_genomic.gff.gz export db=$db export gff3ToRefLink=$gff3ToRefLink export gbffToCds=$gbffToCds export dateStamp=`date "+%F"` export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //'` if [ "\$annotationRelease" == "" ]; then export annotationRelease=\$asmId fi export versionDate=`ls -L --full-time \$ncbiGffGz | awk '{print \$6;}'` echo "\$annotationRelease (\$versionDate)" > ncbiRefSeqVersion.txt # this produces the genePred in NCBI coordinates # 8/23/17: gff3ToGenePred quits over illegal attribute SO_type... make it legal (so_type): +if [ -s ../../../download/\${asmId}.remove.dups.list ]; then + zcat \$ncbiGffGz | grep -v -f ../../../download/\${asmId}.remove.dups.list \\ + | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\ + | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\ + -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp +else zcat \$ncbiGffGz \\ | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\ | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\ -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp +fi genePredCheck \$asmId.gp zcat \$ncbiGffGz | egrep 'tag=(RefSeq|MANE) Select' || true > before.cut9.txt if [ -s before.cut9.txt ]; then cut -f9- before.cut9.txt | tr ';' '\\n' \\ | grep 'Name=' | grep -v NP_ | cut -d= -f2 | sort -u \\ > \$asmId.refseqSelectTranscripts.txt fi rm -f before.cut9.txt # extract labels from semi-structured text in gbff COMMENT/description sections: zcat \$downloadDir/\${asmId}_rna.gbff.gz \\ | (grep ' :: ' || true) \\ | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \\ @@ -429,31 +436,36 @@ fi fi if [ -s \$db.predicted.gp ]; then $genePredCheckDb \$db.predicted.gp fi if [ -s \$db.other.gp ]; then $genePredCheckDb \$db.other.gp fi # join the refLink metadata with curated+predicted names cut -f1 \$db.ncbiRefSeq.gp | sort -u > \$asmId.\$db.name.list join -t\$'\\t' \$asmId.\$db.name.list \$asmId.refLink.tab > \$asmId.\$db.ncbiRefSeqLink.tab # Make bigBed with attributes in extra columns for ncbiRefSeqOther: twoBitInfo $dbTwoBit stdout | sort -k2,2n > \$db.chrom.sizes +if [ -s ../../../download/\${asmId}.remove.dups.list ]; then + genePredToBed -tab -fillSpace \$db.other.gp stdout | sort -k1,1 -k2n,2n \\ + | grep -v -f ../../../download/\${asmId}.remove.dups.list > \$db.other.bed +else genePredToBed -tab -fillSpace \$db.other.gp stdout | sort -k1,1 -k2n,2n > \$db.other.bed +fi $ncbiRefSeqOtherAttrs \$db.other.bed \$asmId.attrs.txt > \$db.other.extras.bed bedToBigBed -type=bed12+13 -as=ncbiRefSeqOther.as -tab \\ -extraIndex=name \\ \$db.other.extras.bed \$db.chrom.sizes \$db.other.bb # Make trix index for ncbiRefSeqOther $ncbiRefSeqOtherIxIxx \\ ncbiRefSeqOther.as \$db.other.extras.bed > ncbiRefSeqOther.ix.tab ixIxx ncbiRefSeqOther.ix.tab ncbiRefSeqOther.ix{,x} # PSL data will be loaded into a psl type track to show the alignments (zgrep "^#" \$ncbiGffGz | head || true) > gffForPsl.gff if [ -s ../../../download/\${asmId}.remove.dups.list ]; then (zegrep -v "NG_" \$ncbiGffGz || true) \\