1f92dad159673221d937c939cfabf4ac32534823
hiram
  Wed Sep 1 15:22:44 2021 -0700
use genePredFilter to eliminate the circular wrap-around genes on chrMT and similar chromosomes; no redmine

diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl
index 15b6bdd..c459827 100755
--- src/hg/utils/automation/doNcbiRefSeq.pl
+++ src/hg/utils/automation/doNcbiRefSeq.pl
@@ -342,36 +342,38 @@
 export dateStamp=`date "+%F"`
 
 export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //; s/ Updated Annotation Release//;'`
 if [ "\$annotationRelease" == "" ]; then
   export annotationRelease=\$asmId
 fi
 export versionDate=`ls -L --full-time \$ncbiGffGz | awk '{print \$6;}'`
 echo "\$annotationRelease (\$versionDate)" > ncbiRefSeqVersion.txt
 
 # this produces the genePred in NCBI coordinates
 # 8/23/17: gff3ToGenePred quits over illegal attribute SO_type... make it legal (so_type):
 if [ -s ../../../download/\${asmId}.remove.dups.list ]; then
   zcat \$ncbiGffGz | grep -v -f ../../../download/\${asmId}.remove.dups.list \\
     | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\
       | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\
-        -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp
+        -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin stdout \\
+      | genePredFilter -chromSizes=../../../\$asmId.chrom.sizes stdin \$asmId.gp
 else
   zcat \$ncbiGffGz \\
     | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\
       | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\
-        -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp
+        -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin stdout \\
+      | genePredFilter -chromSizes=../../../\$asmId.chrom.sizes stdin \$asmId.gp
 fi
 genePredCheck \$asmId.gp
 
 rm -f \$asmId.refseqSelectTranscripts.txt
 zegrep 'tag=(RefSeq|MANE) Select' \$ncbiGffGz > before.cut9.txt || true
 
 if [ -s before.cut9.txt ]; then
   cut -f9- before.cut9.txt | tr ';' '\\n' \\
     | grep 'Name=' | grep -v NP_ | cut -d= -f2 | sort -u \\
        > \$asmId.refseqSelectTranscripts.txt
 fi
 rm -f before.cut9.txt
 
 # extract labels from semi-structured text in gbff COMMENT/description sections:
 zcat \$downloadDir/\${asmId}_rna.gbff.gz \\