0151d00a4a1d73a78c35f6158c6c936ff338faeb max Fri Apr 24 10:37:34 2026 -0700 NMD Escape: MANE subtrack, Rule 1 bug fix, transcript filter. refs #33737 - Add nmdEscMane subtrack (MANE Select Plus Clinical 1.5), built from /gbdb/hg38/mane/mane.bb. Reuses nmdEscTranscripts.html. - Fix Rule 1: measure 50 bp upstream of the transcript's last splice junction (including 3'UTR introns) rather than stripping 3'UTR from the exon list first. The old logic painted the entire last CDS exon as NMD-escape whenever the transcript had only one CDS exon, even when a 3'UTR intron sat far past the stop codon (e.g. NBDY: 207 bp of CDS over-painted for a junction 2.6 kb past the stop). - Add --rule1-mode {cds,mrna} (default cds): cds counts only CDS bp on the walk-back (paints up to 50 bp of CDS matching the rule label literally); mrna counts mRNA bp and clips to CDS (tracks the 55 bp rule literature). Documented in makeDoc. - Rule 4: when a 3'UTR intron exists, the last CDS-containing exon has a downstream EJC and is now eligible for the long-exon rule. - Mouseover lists contributing transcript accessions when 1-3 items collapse into a region; falls back to a count above that. - Add filterText/filterType/filterLabel on all three escape subtracks so a user can narrow the display to one transcript. - genePredNmdEsc: --gene-sym-field (default 17 for Gencode; pass 18 for MANE, whose HGNC symbol lives in bigGenePred geneName2). - Add findShortTxLongUtrIntron.py helper for finding MANE transcripts with long UTR introns (used to pick NMD edge-case test cases). Post-fix collapsed-region counts (--rule1-mode=cds): MANE 1.5: 67,752 Gencode V49: 233,375 RefSeq Curated: 112,356 diff --git src/hg/makeDb/scripts/nmd/genePredNmdEsc src/hg/makeDb/scripts/nmd/genePredNmdEsc index a0015502a90..e599b86bcfd 100755 --- src/hg/makeDb/scripts/nmd/genePredNmdEsc +++ src/hg/makeDb/scripts/nmd/genePredNmdEsc @@ -3,66 +3,79 @@ import logging, sys, optparse, gzip, subprocess from collections import defaultdict from os.path import join, basename, dirname, isfile # ==== functions ===== # Colors by rule RULE_COLORS = { 1: "255,0,0", # red - last 50bp of last coding junction 2: "255,140,0", # orange - intronless transcript 3: "139,0,0", # dark red - first 100bp of coding nucleotides 4: "255,215,0", # gold - long coding exon (>400 nt) } RULE_DESCRIPTIONS = { - 1: "Rule 1 - last 50bp of last coding exon junction", + 1: "Rule 1 - CDS within 50bp of the last splice junction", 2: "Rule 2 - single coding exon, no 3'UTR intron", 3: "Rule 3 - first 100bp of coding nucleotides", 4: "Rule 4 - PTC in long exon (>400 nt)", } # Lindeboom et al. 2016: reduced NMD efficiency for PTCs in exons longer than this LONG_EXON_THRESHOLD = 400 def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] inFname outDecoFname outCollapsedFname - " "Output BEDs with NMD escape regions. First output is decorator format, " "second is collapsed bigGenePred with gene symbols and transcript lists.") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-f", "--format", dest="format", action="store", default="genePredExt", help="Input format: 'genePredExt' (with bin column, e.g. ncbiRefSeq) or " "'bigGenePred' (e.g. gencode .bb file, will use bigBedToBed). Default: genePredExt") + parser.add_option("--gene-sym-field", dest="geneSymField", type="int", default=17, + help="bigGenePred 0-based field index that holds the gene symbol. " + "Default 17 (Gencode 'geneName'). MANE puts the HGNC symbol in field 18.") + parser.add_option("--rule1-mode", dest="rule1Mode", action="store", default="cds", + choices=["cds", "mrna"], + help="How to count the 50 bp walk-back from the last splice junction. " + "'cds' (default): count only CDS nucleotides, skipping 3'UTR; " + "paints up to 50 bp of CDS regardless of how far the last junction " + "sits past the stop codon. " + "'mrna': count mRNA nucleotides including 3'UTR, then clip output " + "to CDS; when the last junction is more than 50 mRNA-bp past the " + "stop codon, no CDS position is painted (tracks the 55 bp rule " + "literature more closely).") (options, args) = parser.parse_args() if len(args) != 3: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG) else: logging.basicConfig(level=logging.INFO) logging.getLogger().setLevel(logging.INFO) return args, options -def openInput(fname, fmt): +def openInput(fname, fmt, geneSymField=17): """Open input file and yield parsed transcript dicts. Both formats yield dicts with keys: name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, geneSym, cdsStartStat, cdsEndStat, exonFrames """ if fmt == "bigGenePred": # pipe through bigBedToBed proc = subprocess.Popen(["bigBedToBed", fname, "stdout"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) fh = proc.stdout elif fname.endswith(".gz"): fh = gzip.open(fname, "rt") else: fh = open(fname) for line in fh: @@ -98,56 +111,145 @@ # exonFrames, type, geneName, geneName2, geneType chromStart = int(fields[1]) blockSizes = [int(x) for x in fields[10].strip(",").split(",") if x] blockStarts = [int(x) for x in fields[11].strip(",").split(",") if x] rec = { "name": fields[3], "chrom": fields[0], "strand": fields[5], "txStart": chromStart, "txEnd": int(fields[2]), "cdsStart": int(fields[6]), "cdsEnd": int(fields[7]), "exonCount": int(fields[9]), "exonStarts": [chromStart + s for s in blockStarts], "exonEnds": [chromStart + s + sz for s, sz in zip(blockStarts, blockSizes)], - "geneSym": fields[17] if len(fields) > 17 else "", + "geneSym": fields[geneSymField] if len(fields) > geneSymField else "", "cdsStartStat": fields[13] if len(fields) > 13 else "none", "cdsEndStat": fields[14] if len(fields) > 14 else "none", "exonFrames": fields[15].strip(",") if len(fields) > 15 else "", } else: raise ValueError("Unknown format: " + fmt) yield rec if fmt == "bigGenePred": proc.wait() def bedOut(row, txStart, txEnd, ofh, rule): "write a decorator BED line" row = [str(x) for x in row] chrom, start, end, name = row decItem = chrom+":"+str(txStart)+"-"+str(txEnd)+":"+name color = RULE_COLORS[rule] mouseover = RULE_DESCRIPTIONS[rule] row = [chrom, start, end, name, "0", ".", start, end, color, "1", str(int(end)-int(start)), "0", decItem, "block", color, "", mouseover] ofh.write("\t".join(row)) ofh.write("\n") +def rule1Regions(rec, nUpstreamJxn=50, mode="cds"): + """50 bp rule: paint CDS positions near the transcript's last splice + junction. The last junction is taken over ALL exons of the transcript, + including 3'UTR introns, because those introns deposit EJCs that can + trigger NMD. + + Two modes control how the nUpstreamJxn walk-back from the last junction + is counted: + mode='cds' - count only CDS nucleotides, skipping any 3'UTR as you + walk upstream. Paints up to nUpstreamJxn bp of CDS + regardless of how far the last junction sits past the + stop codon. This is what the 'last 50 bp of last + junction' label most literally means in terms of CDS. + mode='mrna' - count mRNA nucleotides (including 3'UTR). If the + walk-back stays in 3'UTR the whole way, no CDS is + painted; otherwise the CDS overlap of the walked + window is painted. This tracks the 55 bp rule + literature, where the distance is an mRNA distance. + + Plus any CDS downstream of the last junction (the CDS overlap with the + last mRNA exon). Returns a list of (genStart, genEnd) tuples in + genomic order, CDS-only.""" + strand = rec["strand"] + cdsStart = rec["cdsStart"] + cdsEnd = rec["cdsEnd"] + exonStarts = rec["exonStarts"] + exonEnds = rec["exonEnds"] + + if len(exonStarts) < 2: + return [] # no splice junction -> rule does not apply + + exons = list(zip(exonStarts, exonEnds)) + if strand == "-": + exons = list(reversed(exons)) # exons now in mRNA 5'->3' order + + def clipToCds(s, e): + s2, e2 = max(s, cdsStart), min(e, cdsEnd) + return (s2, e2) if s2 < e2 else None + + out = [] + # everything downstream of the last junction = last mRNA exon clipped to CDS + clipped = clipToCds(*exons[-1]) + if clipped: + out.append(clipped) + + # walk upstream of the last junction through earlier exons in mRNA order + remaining = nUpstreamJxn + for i in range(len(exons) - 2, -1, -1): + if remaining <= 0: + break + exStart, exEnd = exons[i] + + if mode == "cds": + # count only CDS bp; skip 3'UTR-only exons entirely + cds = clipToCds(exStart, exEnd) + if not cds: + continue + cdsS, cdsE = cds + cdsLen = cdsE - cdsS + if remaining >= cdsLen: + out.append((cdsS, cdsE)) + remaining -= cdsLen + else: + # take the 3'-most `remaining` CDS bp + if strand == "+": + out.append((cdsE - remaining, cdsE)) + else: + out.append((cdsS, cdsS + remaining)) + remaining = 0 + + else: # mode == "mrna" + exLen = exEnd - exStart + if remaining >= exLen: + chunk = (exStart, exEnd) + remaining -= exLen + else: + # take the 3'-most `remaining` mRNA bp + if strand == "+": + chunk = (exEnd - remaining, exEnd) + else: + chunk = (exStart, exStart + remaining) + remaining = 0 + clipped = clipToCds(*chunk) + if clipped: + out.append(clipped) + + return sorted(out) + + def outputExonsUpTo(from3Prime, cdsExons, chrom, txStart, txEnd, name, n, ofh, rule): """ given a list of (start, end), output start-end BEDs to ofh that cover n nucleotides. Returns list of (chrom, start, end) regions output. """ doneNs = 0 doStop = False regions = [] if from3Prime: cdsExons = list(reversed(cdsExons)) # -50 means "-50 from the last junction" so take length of last exon + 50 if n < 0: ex1Start = cdsExons[0][0] ex1End = cdsExons[0][1] n = (ex1End-ex1Start)+50 @@ -170,31 +272,31 @@ return regions return regions # ----------- main -------------- def main(): args, options = parseArgs() inFname, outDecoFname, outCollapsedFname = args decoOfh = open(outDecoFname, "w") # collect regions for the collapsed output: # key = (chrom, start, end, rule) -> {"transcripts": [...], "strand": set(), "geneSym": str} regionData = defaultdict(lambda: {"transcripts": [], "strands": set(), "geneSym": ""}) - for rec in openInput(inFname, options.format): + for rec in openInput(inFname, options.format, options.geneSymField): name = rec["name"] chrom = rec["chrom"] strand = rec["strand"] txStart = rec["txStart"] txEnd = rec["txEnd"] cdsStart = rec["cdsStart"] cdsEnd = rec["cdsEnd"] # skip non-coding transcripts (cdsStart == cdsEnd) if cdsStart >= cdsEnd: continue # gene symbol from record, fall back to transcript name geneSym = rec["geneSym"] @@ -228,69 +330,80 @@ # that requires: single CDS exon (no CDS-CDS junction) AND no 3'UTR intron. # 5'UTR introns are allowed - their EJCs are upstream of the stop and either # are cleared by the scanning 40S or are never encountered by the terminating # ribosome, so they do not trigger NMD. if strand == "+": hasThreeUtrIntron = any(s > cdsEnd for s in rec["exonStarts"]) else: hasThreeUtrIntron = any(e < cdsStart for e in rec["exonEnds"]) if len(cdsExons) == 1 and not hasThreeUtrIntron: # rule 2: no EJC downstream of stop codon -> any PTC in CDS escapes NMD bed = [chrom, str(cdsStart), str(cdsEnd), name] bedOut(bed, txStart, txEnd, decoOfh, 2) addRegions([(chrom, cdsStart, cdsEnd)], 2) else: - if strand=="+": # rule 3: first 100bp of coding nucleotides + if strand == "+": regions = outputExonsUpTo(False, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3) - addRegions(regions, 3) - # rule 1: last 50bp of last coding junction - regions = outputExonsUpTo(True, cdsExons, chrom, txStart, txEnd, name, -50, decoOfh, 1) - addRegions(regions, 1) - # rule 4: coding exons >400 nt (excluding last; last is already covered by rule 1) - rule4Exons = cdsExons[:-1] else: - # rule 3: first 100bp of coding nucleotides regions = outputExonsUpTo(True, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3) addRegions(regions, 3) - # rule 1: last 50bp of last coding junction - regions = outputExonsUpTo(False, cdsExons, chrom, txStart, txEnd, name, -50, decoOfh, 1) - addRegions(regions, 1) - # rule 4: coding exons >400 nt (excluding last, which on - strand is cdsExons[0]) + + # rule 1: within 50 mRNA-bp upstream of the last splice junction of + # the transcript (incl. 3'UTR introns), plus anything downstream of + # that junction, clipped to CDS. + rule1Tuples = [] + for s, e in rule1Regions(rec, 50, mode=options.rule1Mode): + bed = [chrom, str(s), str(e), name] + bedOut(bed, txStart, txEnd, decoOfh, 1) + rule1Tuples.append((chrom, s, e)) + addRegions(rule1Tuples, 1) + + # rule 4: coding exons >400 nt that have a downstream splice junction. + # Without a 3'UTR intron, the last CDS exon is the last mRNA exon + # and is excluded (no downstream EJC). With a 3'UTR intron, every + # CDS exon has a downstream junction and is eligible. + if hasThreeUtrIntron: + rule4Exons = cdsExons + elif strand == "+": + rule4Exons = cdsExons[:-1] + else: rule4Exons = cdsExons[1:] - # rule 4: Lindeboom et al. 2016 long-exon rule for exStart, exEnd in rule4Exons: if exEnd - exStart > LONG_EXON_THRESHOLD: bed = [chrom, str(exStart), str(exEnd), name] bedOut(bed, txStart, txEnd, decoOfh, 4) addRegions([(chrom, exStart, exEnd)], 4) decoOfh.close() # write collapsed output as bed 9 + mouseover + transcripts collOfh = open(outCollapsedFname, "w") for (chrom, start, end, rule), data in sorted(regionData.items()): txList = sorted(set(data["transcripts"])) geneSym = data["geneSym"] if not geneSym: geneSym = txList[0] # pick strand: use the strand if all transcripts agree, else "." strands = data["strands"] strand = list(strands)[0] if len(strands) == 1 else "." color = RULE_COLORS[rule] txListStr = ",".join(txList) + if len(txList) <= 3: + mouseover = RULE_DESCRIPTIONS[rule] + " (" + ", ".join(txList) + ")" + else: mouseover = RULE_DESCRIPTIONS[rule] + " (" + str(len(txList)) + " transcripts)" row = [chrom, str(start), str(end), geneSym, "0", strand, str(start), str(end), color, mouseover, txListStr] collOfh.write("\t".join(row)) collOfh.write("\n") collOfh.close() logging.info("Wrote %d decorator regions to %s" % (sum(len(d["transcripts"]) for d in regionData.values()), outDecoFname)) logging.info("Wrote %d collapsed regions to %s" % (len(regionData), outCollapsedFname)) main()