36d92a1a5471428caeab49221e1585b76537498d
max
  Wed Apr 15 06:06:47 2026 -0700
fix zero-length items in NMD escape regions output, refs #33737

Two edge cases produced degenerate zero-length items in the BED output:
- CDS boundary landing exactly on an exon boundary created zero-length
cdsExons during construction
- In outputExonsUpTo, when cumulative doneNs exactly hit the target n
on a previous iteration, the next iteration entered the truncation
branch with missBps=0, producing a zero-length region

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/nmd/genePredNmdEsc src/hg/makeDb/scripts/nmd/genePredNmdEsc
index b088b2729cf..f95e07be6ea 100755
--- src/hg/makeDb/scripts/nmd/genePredNmdEsc
+++ src/hg/makeDb/scripts/nmd/genePredNmdEsc
@@ -135,30 +135,32 @@
     """ given a list of (start, end), output start-end BEDs to ofh that cover n nucleotides.
         Returns list of (chrom, start, end) regions output. """
     doneNs = 0
     doStop = False
     regions = []
     if from3Prime:
         cdsExons = list(reversed(cdsExons))
 
     # -50 means "-50 from the last junction" so take length of last exon + 50
     if n < 0:
         ex1Start = cdsExons[0][0]
         ex1End = cdsExons[0][1]
         n = (ex1End-ex1Start)+50
 
     for start, end in cdsExons:
+        if doneNs >= n:
+            return regions
         exLen = end-start
         missBps = n-doneNs
         if doneNs+exLen > n:
             if from3Prime:
                 start = end-missBps
             else:
                 end = start+missBps
             doStop = True
         bed = [chrom, str(start), str(end), name]
         doneNs += exLen
         bedOut(bed, txStart, txEnd, ofh, rule)
         regions.append( (chrom, start, end) )
         if doStop:
             return regions
     return regions
@@ -192,30 +194,33 @@
         geneSym = rec["geneSym"]
 
         # only keep exons that have CDS and cut around CDS
         cdsExons = []
         for exStart, exEnd in zip(rec["exonStarts"], rec["exonEnds"]):
             # 5' UTR
             if cdsStart > exEnd:
                 continue
             # 3' UTR
             if exStart > cdsEnd:
                 continue
             if (exStart <= cdsStart and cdsStart <= exEnd):
                 exStart = cdsStart
             if (exStart <= cdsEnd and cdsEnd <= exEnd):
                 exEnd = cdsEnd
+            # skip degenerate cdsExons (CDS boundary lands exactly on exon boundary)
+            if exStart >= exEnd:
+                continue
             cdsExons.append( (exStart, exEnd) )
 
         def addRegions(regions, rule):
             for r in regions:
                 key = (r[0], r[1], r[2], rule)
                 regionData[key]["transcripts"].append(name)
                 regionData[key]["strands"].add(strand)
                 if geneSym and not regionData[key]["geneSym"]:
                     regionData[key]["geneSym"] = geneSym
 
         if len(cdsExons)==1:
             # rule 2: intronless transcript
             bed = [chrom, str(cdsStart), str(cdsEnd), name]
             bedOut(bed, txStart, txEnd, decoOfh, 2)
             addRegions([(chrom, cdsStart, cdsEnd)], 2)