1f92dad159673221d937c939cfabf4ac32534823 hiram Wed Sep 1 15:22:44 2021 -0700 use genePredFilter to eliminate the circular wrap around genes on chrMT and like chromosomes no redmine diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl index 15b6bdd..c459827 100755 --- src/hg/utils/automation/doNcbiRefSeq.pl +++ src/hg/utils/automation/doNcbiRefSeq.pl @@ -342,36 +342,38 @@ export dateStamp=`date "+%F"` export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //; s/ Updated Annotation Release//;'` if [ "\$annotationRelease" == "" ]; then export annotationRelease=\$asmId fi export versionDate=`ls -L --full-time \$ncbiGffGz | awk '{print \$6;}'` echo "\$annotationRelease (\$versionDate)" > ncbiRefSeqVersion.txt # this produces the genePred in NCBI coordinates # 8/23/17: gff3ToGenePred quits over illegal attribute SO_type... make it legal (so_type): if [ -s ../../../download/\${asmId}.remove.dups.list ]; then zcat \$ncbiGffGz | grep -v -f ../../../download/\${asmId}.remove.dups.list \\ | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\ | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\ - -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp + -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin stdout \\ + | genePredFilter -chromSizes=../../../\$asmId.chrom.sizes stdin \$asmId.gp else zcat \$ncbiGffGz \\ | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\ | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\ - -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp + -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin stdout \\ + | genePredFilter -chromSizes=../../../\$asmId.chrom.sizes stdin \$asmId.gp fi genePredCheck \$asmId.gp rm -f \$asmId.refseqSelectTranscripts.txt zegrep 'tag=(RefSeq|MANE) Select' \$ncbiGffGz > before.cut9.txt || true if [ -s before.cut9.txt ]; then cut -f9- before.cut9.txt | tr ';' '\\n' \\ | grep 'Name=' | grep -v NP_ | cut -d= -f2 | sort -u \\ > \$asmId.refseqSelectTranscripts.txt fi rm -f before.cut9.txt # extract labels from semi-structured text in gbff COMMENT/description sections: zcat \$downloadDir/\${asmId}_rna.gbff.gz \\