src/hg/utils/automation/doNcbiRefSeq.pl 06be530f59c8628ccc70d7087528b2d78eaca226

06be530f59c8628ccc70d7087528b2d78eaca226
hiram
  Mon Nov 30 15:49:52 2020 -0800
correctly removing duplicate names refs #24396

diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl
index 6a191ca..7698583 100755
--- src/hg/utils/automation/doNcbiRefSeq.pl
+++ src/hg/utils/automation/doNcbiRefSeq.pl
@@ -338,34 +338,41 @@
 export ncbiGffGz=\$downloadDir/\${asmId}_genomic.gff.gz
 export db=$db
 export gff3ToRefLink=$gff3ToRefLink
 export gbffToCds=$gbffToCds
 export dateStamp=`date "+%F"`
 
 export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //'`
 if [ "\$annotationRelease" == "" ]; then
   export annotationRelease=\$asmId
 fi
 export versionDate=`ls -L --full-time \$ncbiGffGz | awk '{print \$6;}'`
 echo "\$annotationRelease (\$versionDate)" > ncbiRefSeqVersion.txt
 
 # this produces the genePred in NCBI coordinates
 # 8/23/17: gff3ToGenePred quits over illegal attribute SO_type... make it legal (so_type):
+if [ -s ../../../download/\${asmId}.remove.dups.list ]; then
+  zcat \$ncbiGffGz | grep -v -f ../../../download/\${asmId}.remove.dups.list \\
+    | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\
+      | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\
+        -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp
+else
   zcat \$ncbiGffGz \\
     | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\
       | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\
         -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin \$asmId.gp
+fi
 genePredCheck \$asmId.gp
 
 zcat \$ncbiGffGz | egrep 'tag=(RefSeq|MANE) Select' || true > before.cut9.txt
 
 if [ -s before.cut9.txt ]; then
   cut -f9- before.cut9.txt | tr ';' '\\n' \\
     | grep 'Name=' | grep -v NP_ | cut -d= -f2 | sort -u \\
        > \$asmId.refseqSelectTranscripts.txt
 fi
 rm -f before.cut9.txt
 
 # extract labels from semi-structured text in gbff COMMENT/description sections:
 zcat \$downloadDir/\${asmId}_rna.gbff.gz \\
   | (grep ' :: ' || true) \\
     | perl -wpe 's/\\s+::.*//; s/^\\s+//;' \\
@@ -429,31 +436,36 @@
   fi
 fi
 if [ -s \$db.predicted.gp ]; then
   $genePredCheckDb \$db.predicted.gp
 fi
 if [ -s \$db.other.gp ]; then
   $genePredCheckDb \$db.other.gp
 fi
 
 # join the refLink metadata with curated+predicted names
 cut -f1 \$db.ncbiRefSeq.gp | sort -u > \$asmId.\$db.name.list
 join -t\$'\\t' \$asmId.\$db.name.list \$asmId.refLink.tab > \$asmId.\$db.ncbiRefSeqLink.tab
 
 # Make bigBed with attributes in extra columns for ncbiRefSeqOther:
 twoBitInfo $dbTwoBit stdout | sort -k2,2n > \$db.chrom.sizes
+if [ -s ../../../download/\${asmId}.remove.dups.list ]; then
+  genePredToBed -tab -fillSpace \$db.other.gp stdout | sort -k1,1 -k2n,2n \\
+    | grep -v -f ../../../download/\${asmId}.remove.dups.list > \$db.other.bed
+else
   genePredToBed -tab -fillSpace \$db.other.gp stdout | sort -k1,1 -k2n,2n > \$db.other.bed
+fi
 $ncbiRefSeqOtherAttrs \$db.other.bed \$asmId.attrs.txt > \$db.other.extras.bed
 bedToBigBed -type=bed12+13 -as=ncbiRefSeqOther.as -tab \\
   -extraIndex=name \\
   \$db.other.extras.bed \$db.chrom.sizes \$db.other.bb
 
 # Make trix index for ncbiRefSeqOther
 $ncbiRefSeqOtherIxIxx \\
   ncbiRefSeqOther.as \$db.other.extras.bed > ncbiRefSeqOther.ix.tab
 
 ixIxx ncbiRefSeqOther.ix.tab ncbiRefSeqOther.ix{,x}
 
 # PSL data will be loaded into a psl type track to show the alignments
 (zgrep "^#" \$ncbiGffGz | head || true) > gffForPsl.gff
 if [ -s ../../../download/\${asmId}.remove.dups.list ]; then
   (zegrep -v "NG_" \$ncbiGffGz || true) \\