f6a4d0f9523d06a6aa46e4ee4d96b37a4091d4a5
hiram
  Thu Sep 30 18:20:31 2021 -0700
correctly filter on chromSizes to eliminate out of range gene definitions no redmine

diff --git src/hg/utils/automation/doNcbiRefSeq.pl src/hg/utils/automation/doNcbiRefSeq.pl
index c459827..431f021 100755
--- src/hg/utils/automation/doNcbiRefSeq.pl
+++ src/hg/utils/automation/doNcbiRefSeq.pl
@@ -317,63 +317,71 @@
   my $localLiftFile = "$downloadDir/${asmId}To${db}.lift";
   if (! -s "$localLiftFile") {
      $localLiftFile = "../download/${asmId}To${db}.lift" if (-s "../download/${asmId}To${db}.lift");
   }
   $localLiftFile = "" if (! -s "$localLiftFile");
   $localLiftFile = $liftFile if (-s $liftFile);
   my $pslTargetSizes = "-db=\$db";
   my $fakePslSizes = "";
   if (-s "$target2bit") {
     $pslTargetSizes = "-targetSizes=\$db.chrom.sizes";
     $fakePslSizes = "-chromSize=\$db.chrom.sizes";
   }
 
   my $dbTwoBit = "$HgAutomate::clusterData/$db/$db.2bit";
   $dbTwoBit = $target2bit if (-s "$target2bit");
+  my $localSizes = "$HgAutomate::clusterData/$db/chrom.sizes";
 
   $bossScript->add(<<_EOF_
 # establish all variables to use here
 
 export asmId=$asmId
 export downloadDir=$downloadDir
 export ncbiGffGz=\$downloadDir/\${asmId}_genomic.gff.gz
 export db=$db
+export chromSizes="$localSizes"
 export gff3ToRefLink=$gff3ToRefLink
 export gbffToCds=$gbffToCds
 export dateStamp=`date "+%F"`
+if [ -s "../../../\$asmId.chrom.sizes" ]; then
+  chromSizes="../../../\$asmId.chrom.sizes"
+fi
+if [ -s "../download/\$asmId.ncbi.chrom.sizes" ]; then
+  chromSizes="../download/\$asmId.ncbi.chrom.sizes"
+fi
 
 export annotationRelease=`zcat \$ncbiGffGz | head -100 | grep ^#.annotation-source | sed -e 's/.*annotation-source //; s/ Updated Annotation Release//;'`
 if [ "\$annotationRelease" == "" ]; then
   export annotationRelease=\$asmId
 fi
 export versionDate=`ls -L --full-time \$ncbiGffGz | awk '{print \$6;}'`
 echo "\$annotationRelease (\$versionDate)" > ncbiRefSeqVersion.txt
 
 # this produces the genePred in NCBI coordinates
 # 8/23/17: gff3ToGenePred quits over illegal attribute SO_type... make it legal (so_type):
 if [ -s ../../../download/\${asmId}.remove.dups.list ]; then
   zcat \$ncbiGffGz | grep -v -f ../../../download/\${asmId}.remove.dups.list \\
     | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\
       | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\
         -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin stdout \\
-      | genePredFilter -chromSizes=../../../\$asmId.chrom.sizes stdin \$asmId.gp
+      | genePredFilter -chromSizes=\$chromSizes stdin \$asmId.gp
 else
   zcat \$ncbiGffGz \\
     | sed -re 's/([;\\t])SO_type=/\\1so_type=/;' \\
       | gff3ToGenePred $warnOnly -refseqHacks -attrsOut=\$asmId.attrs.txt \\
         -unprocessedRootsOut=\$asmId.unprocessedRoots.txt stdin stdout \\
-      | genePredFilter -chromSizes=../../../\$asmId.chrom.sizes stdin \$asmId.gp
+      | genePredFilter -chromSizes=\$chromSizes stdin \$asmId.gp
 fi
 genePredCheck \$asmId.gp
 
 rm -f \$asmId.refseqSelectTranscripts.txt
 zegrep 'tag=(RefSeq|MANE) Select' \$ncbiGffGz > before.cut9.txt || true
 
 if [ -s before.cut9.txt ]; then
   cut -f9- before.cut9.txt | tr ';' '\\n' \\
     | grep 'Name=' | grep -v NP_ | cut -d= -f2 | sort -u \\
        > \$asmId.refseqSelectTranscripts.txt
 fi
 rm -f before.cut9.txt
 
 # extract labels from semi-structured text in gbff COMMENT/description sections:
 zcat \$downloadDir/\${asmId}_rna.gbff.gz \\