34d2eee845f5f45e571d1e153c632683b8a93f75 lrnassar Tue Apr 21 16:17:53 2026 -0700 Refine NMD Escape Rule 2 gate to "single coding exon and no 3'UTR intron". refs #33737 Previously Rule 2 required exonCount==1 (truly intronless). This overcorrected for single-CDS-exon transcripts whose only introns are in the 5'UTR: biologically these have no EJC downstream of the stop codon (5'UTR EJCs are cleared by the scanning 40S or sit upstream of the terminating ribosome) and are NMD-immune, but the code pushed them to Rules 1/3 under a less accurate "last coding exon" label. New gate: len(cdsExons) == 1 AND no exon-exon junction strictly downstream of the stop codon (strand-aware). Transcripts with a single coding exon but a 3'UTR intron correctly stay in Rules 1/3 because that intron deposits an EJC that can trigger NMD. 3,113 RefSeq Curated and 10,790 Gencode V49 transcripts move into Rule 2. 140 RefSeq and 1,135 Gencode single-CDS-exon transcripts with 3'UTR introns correctly remain in Rules 1/3. Description page and makedoc updated. diff --git src/hg/makeDb/scripts/nmd/genePredNmdEsc src/hg/makeDb/scripts/nmd/genePredNmdEsc index d8b3e005c4d..6c2ea4f891b 100755 --- src/hg/makeDb/scripts/nmd/genePredNmdEsc +++ src/hg/makeDb/scripts/nmd/genePredNmdEsc @@ -4,31 +4,31 @@ from collections import defaultdict from os.path import join, basename, dirname, isfile # ==== functions ===== # Colors by rule RULE_COLORS = { 1: "255,0,0", # red - last 50bp of last coding junction 2: "255,140,0", # orange - intronless transcript 3: "139,0,0", # dark red - first 100bp of coding nucleotides 4: "255,215,0", # gold - long coding exon (>400 nt) } RULE_DESCRIPTIONS = { 1: "Rule 1 - last 50bp of last coding exon junction", - 2: "Rule 2 - intronless transcript", + 2: "Rule 2 - single coding exon, no 3'UTR intron", 3: "Rule 3 - first 100bp of coding nucleotides", 4: "Rule 4 - PTC in long exon (>400 nt)", } # Lindeboom et al. 2016: reduced NMD efficiency for PTCs in exons longer than this LONG_EXON_THRESHOLD = 400 def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] inFname outDecoFname outCollapsedFname - " "Output BEDs with NMD escape regions. First output is decorator format, " "second is collapsed bigGenePred with gene symbols and transcript lists.") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") @@ -212,32 +212,42 @@ if (exStart <= cdsEnd and cdsEnd <= exEnd): exEnd = cdsEnd # skip degenerate cdsExons (CDS boundary lands exactly on exon boundary) if exStart >= exEnd: continue cdsExons.append( (exStart, exEnd) ) def addRegions(regions, rule): for r in regions: key = (r[0], r[1], r[2], rule) regionData[key]["transcripts"].append(name) regionData[key]["strands"].add(strand) if geneSym and not regionData[key]["geneSym"]: regionData[key]["geneSym"] = geneSym - if rec["exonCount"] == 1: - # rule 2: truly intronless transcript (no splicing, no EJCs deposited) + # rule 2 applies when no EJC can be deposited downstream of the stop codon. + # that requires: single CDS exon (no CDS-CDS junction) AND no 3'UTR intron. + # 5'UTR introns are allowed - their EJCs are upstream of the stop and either + # are cleared by the scanning 40S or are never encountered by the terminating + # ribosome, so they do not trigger NMD. + if strand == "+": + hasThreeUtrIntron = any(s > cdsEnd for s in rec["exonStarts"]) + else: + hasThreeUtrIntron = any(e < cdsStart for e in rec["exonEnds"]) + + if len(cdsExons) == 1 and not hasThreeUtrIntron: + # rule 2: no EJC downstream of stop codon -> any PTC in CDS escapes NMD bed = [chrom, str(cdsStart), str(cdsEnd), name] bedOut(bed, txStart, txEnd, decoOfh, 2) addRegions([(chrom, cdsStart, cdsEnd)], 2) else: if strand=="+": # rule 3: first 100bp of coding nucleotides regions = outputExonsUpTo(False, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3) addRegions(regions, 3) # rule 1: last 50bp of last coding junction regions = outputExonsUpTo(True, cdsExons, chrom, txStart, txEnd, name, -50, decoOfh, 1) addRegions(regions, 1) # rule 4: coding exons >400 nt (excluding last; last is already covered by rule 1) rule4Exons = cdsExons[:-1] else: # rule 3: first 100bp of coding nucleotides