36d92a1a5471428caeab49221e1585b76537498d max Wed Apr 15 06:06:47 2026 -0700 fix zero-length items in NMD escape regions output, refs #33737 Two edge cases produced degenerate zero-length items in the BED output: - CDS boundary landing exactly on an exon boundary created zero-length cdsExons during construction - In outputExonsUpTo, when cumulative doneNs exactly hit the target n on a previous iteration, the next iteration entered the truncation branch with missBps=0, producing a zero-length region Co-Authored-By: Claude Opus 4.6 (1M context) diff --git src/hg/makeDb/scripts/nmd/genePredNmdEsc src/hg/makeDb/scripts/nmd/genePredNmdEsc index b088b2729cf..f95e07be6ea 100755 --- src/hg/makeDb/scripts/nmd/genePredNmdEsc +++ src/hg/makeDb/scripts/nmd/genePredNmdEsc @@ -135,30 +135,32 @@ """ given a list of (start, end), output start-end BEDs to ofh that cover n nucleotides. Returns list of (chrom, start, end) regions output. """ doneNs = 0 doStop = False regions = [] if from3Prime: cdsExons = list(reversed(cdsExons)) # -50 means "-50 from the last junction" so take length of last exon + 50 if n < 0: ex1Start = cdsExons[0][0] ex1End = cdsExons[0][1] n = (ex1End-ex1Start)+50 for start, end in cdsExons: + if doneNs >= n: + return regions exLen = end-start missBps = n-doneNs if doneNs+exLen > n: if from3Prime: start = end-missBps else: end = start+missBps doStop = True bed = [chrom, str(start), str(end), name] doneNs += exLen bedOut(bed, txStart, txEnd, ofh, rule) regions.append( (chrom, start, end) ) if doStop: return regions return regions @@ -192,30 +194,33 @@ geneSym = rec["geneSym"] # only keep exons that have CDS and cut around CDS cdsExons = [] for exStart, exEnd in zip(rec["exonStarts"], rec["exonEnds"]): # 5' UTR if cdsStart > exEnd: continue # 3' UTR if exStart > cdsEnd: continue if (exStart <= cdsStart and cdsStart <= exEnd): exStart = cdsStart if (exStart <= cdsEnd and cdsEnd <= exEnd): exEnd = cdsEnd + # skip degenerate cdsExons (CDS boundary lands exactly on exon boundary) + if exStart >= exEnd: + continue cdsExons.append( (exStart, exEnd) ) def addRegions(regions, rule): for r in regions: key = (r[0], r[1], r[2], rule) regionData[key]["transcripts"].append(name) regionData[key]["strands"].add(strand) if geneSym and not regionData[key]["geneSym"]: regionData[key]["geneSym"] = geneSym if len(cdsExons)==1: # rule 2: intronless transcript bed = [chrom, str(cdsStart), str(cdsEnd), name] bedOut(bed, txStart, txEnd, decoOfh, 2) addRegions([(chrom, cdsStart, cdsEnd)], 2)