3a62ea7e9a8cb3503586a0a78570331308c9bc58
max
Mon Apr 27 02:23:00 2026 -0700
NMD Escape MANE: expose NM_ accession via labelFields. refs #33737
Per QA, the MANE subtrack now shows the NCBI RefSeq accession by default
instead of the HGNC gene symbol, with the ENST and gene symbol still
selectable via labelFields.
- genePredNmdEsc: new --ncbi-id-field N option (default -1 = unused).
When set, the named bigGenePred column is captured per-transcript and
written into a new ncbiIds output column. For MANE pass 21.
- genePredNmdEsc: new --no-collapse option. By default, regions with
identical (chrom, start, end, rule) from multiple transcripts collapse
into one row with comma-separated lists. With --no-collapse the script
emits one row per (transcript, region). Used for MANE so each
label-field column holds a single value: the 74 MANE Plus Clinical
genes (e.g. LMNA) get two rows per region instead of one row with a
two-element list.
- nmdEscCollapsed.as: add lstring ncbiIds column. Schema is now bed9+3.
- nmd.ra (nmdEscMane only): labelFields ncbiIds,name,transcripts;
defaultLabelFields ncbiIds; labelSeparator " / ". Gencode and RefSeq
subtracks unchanged - they default to the gene symbol (name column)
and have an empty ncbiIds column.
- doc/hg38/nmd.txt: bump all three bedToBigBed invocations to bed9+3
and document the --ncbi-id-field 21 + --no-collapse invocation for
MANE.
Counts: MANE 68,028 (--no-collapse); Gencode 233,375; RefSeq 112,356.
diff --git src/hg/makeDb/scripts/nmd/genePredNmdEsc src/hg/makeDb/scripts/nmd/genePredNmdEsc
index e599b86bcfd..baf08c7b87e 100755
--- src/hg/makeDb/scripts/nmd/genePredNmdEsc
+++ src/hg/makeDb/scripts/nmd/genePredNmdEsc
@@ -1,409 +1,430 @@
#!/usr/bin/env python
import logging, sys, optparse, gzip, subprocess
from collections import defaultdict
from os.path import join, basename, dirname, isfile
# ==== functions =====
# Colors by rule
RULE_COLORS = {
1: "255,0,0", # red - last 50bp of last coding junction
2: "255,140,0", # orange - intronless transcript
3: "139,0,0", # dark red - first 100bp of coding nucleotides
4: "255,215,0", # gold - long coding exon (>400 nt)
}
RULE_DESCRIPTIONS = {
1: "Rule 1 - CDS within 50bp of the last splice junction",
2: "Rule 2 - single coding exon, no 3'UTR intron",
3: "Rule 3 - first 100bp of coding nucleotides",
4: "Rule 4 - PTC in long exon (>400 nt)",
}
# Lindeboom et al. 2016: reduced NMD efficiency for PTCs in exons longer than this
LONG_EXON_THRESHOLD = 400
def parseArgs():
" setup logging, parse command line arguments and options. -h shows auto-generated help page "
parser = optparse.OptionParser("usage: %prog [options] inFname outDecoFname outCollapsedFname - "
"Output BEDs with NMD escape regions. First output is decorator format, "
"second is collapsed bigGenePred with gene symbols and transcript lists.")
parser.add_option("-d", "--debug", dest="debug", action="store_true",
help="show debug messages")
parser.add_option("-f", "--format", dest="format", action="store", default="genePredExt",
help="Input format: 'genePredExt' (with bin column, e.g. ncbiRefSeq) or "
"'bigGenePred' (e.g. gencode .bb file, will use bigBedToBed). Default: genePredExt")
parser.add_option("--gene-sym-field", dest="geneSymField", type="int", default=17,
help="bigGenePred 0-based field index that holds the gene symbol. "
"Default 17 (Gencode 'geneName'). MANE puts the HGNC symbol in field 18.")
+ parser.add_option("--ncbi-id-field", dest="ncbiIdField", type="int", default=-1,
+ help="bigGenePred 0-based field index that holds the NCBI RefSeq accession "
+ "(NM_/NR_). Default -1 (not extracted). For MANE, pass 21.")
+ parser.add_option("--no-collapse", dest="noCollapse", action="store_true", default=False,
+ help="Emit one row per (transcript, region) instead of collapsing identical "
+ "coordinates across transcripts. Useful for sets like MANE where each "
+ "label-field column should be a single value.")
parser.add_option("--rule1-mode", dest="rule1Mode", action="store", default="cds",
choices=["cds", "mrna"],
help="How to count the 50 bp walk-back from the last splice junction. "
"'cds' (default): count only CDS nucleotides, skipping 3'UTR; "
"paints up to 50 bp of CDS regardless of how far the last junction "
"sits past the stop codon. "
"'mrna': count mRNA nucleotides including 3'UTR, then clip output "
"to CDS; when the last junction is more than 50 mRNA-bp past the "
"stop codon, no CDS position is painted (tracks the 55 bp rule "
"literature more closely).")
(options, args) = parser.parse_args()
if len(args) != 3:
parser.print_help()
exit(1)
if options.debug:
logging.basicConfig(level=logging.DEBUG)
logging.getLogger().setLevel(logging.DEBUG)
else:
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)
return args, options
-def openInput(fname, fmt, geneSymField=17):
+def openInput(fname, fmt, geneSymField=17, ncbiIdField=-1):
"""Open input file and yield parsed transcript dicts.
Both formats yield dicts with keys: name, chrom, strand, txStart, txEnd,
- cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, geneSym, cdsStartStat, cdsEndStat, exonFrames
+ cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, geneSym, ncbiId,
+ cdsStartStat, cdsEndStat, exonFrames
"""
if fmt == "bigGenePred":
# pipe through bigBedToBed
proc = subprocess.Popen(["bigBedToBed", fname, "stdout"],
stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
fh = proc.stdout
elif fname.endswith(".gz"):
fh = gzip.open(fname, "rt")
else:
fh = open(fname)
for line in fh:
line = line.rstrip("\n\r")
if not line:
continue
fields = line.split("\t")
if fmt == "genePredExt":
# genePredExt with bin column:
# bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd,
# exonCount, exonStarts, exonEnds, score, name2, cdsStartStat, cdsEndStat, exonFrames
rec = {
"name": fields[1],
"chrom": fields[2],
"strand": fields[3],
"txStart": int(fields[4]),
"txEnd": int(fields[5]),
"cdsStart": int(fields[6]),
"cdsEnd": int(fields[7]),
"exonCount": int(fields[8]),
"exonStarts": [int(x) for x in fields[9].strip(",").split(",") if x],
"exonEnds": [int(x) for x in fields[10].strip(",").split(",") if x],
"geneSym": fields[12] if len(fields) > 12 else "",
+ "ncbiId": "",
"cdsStartStat": fields[13] if len(fields) > 13 else "none",
"cdsEndStat": fields[14] if len(fields) > 14 else "none",
"exonFrames": fields[15].strip(",") if len(fields) > 15 else "",
}
elif fmt == "bigGenePred":
# bigGenePred:
# chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd,
# color, blockCount, blockSizes, chromStarts, name2, cdsStartStat, cdsEndStat,
# exonFrames, type, geneName, geneName2, geneType
chromStart = int(fields[1])
blockSizes = [int(x) for x in fields[10].strip(",").split(",") if x]
blockStarts = [int(x) for x in fields[11].strip(",").split(",") if x]
rec = {
"name": fields[3],
"chrom": fields[0],
"strand": fields[5],
"txStart": chromStart,
"txEnd": int(fields[2]),
"cdsStart": int(fields[6]),
"cdsEnd": int(fields[7]),
"exonCount": int(fields[9]),
"exonStarts": [chromStart + s for s in blockStarts],
"exonEnds": [chromStart + s + sz for s, sz in zip(blockStarts, blockSizes)],
"geneSym": fields[geneSymField] if len(fields) > geneSymField else "",
+ "ncbiId": fields[ncbiIdField] if (ncbiIdField >= 0 and len(fields) > ncbiIdField) else "",
"cdsStartStat": fields[13] if len(fields) > 13 else "none",
"cdsEndStat": fields[14] if len(fields) > 14 else "none",
"exonFrames": fields[15].strip(",") if len(fields) > 15 else "",
}
else:
raise ValueError("Unknown format: " + fmt)
yield rec
if fmt == "bigGenePred":
proc.wait()
def bedOut(row, txStart, txEnd, ofh, rule):
"write a decorator BED line"
row = [str(x) for x in row]
chrom, start, end, name = row
decItem = chrom+":"+str(txStart)+"-"+str(txEnd)+":"+name
color = RULE_COLORS[rule]
mouseover = RULE_DESCRIPTIONS[rule]
row = [chrom, start, end, name, "0", ".", start, end, color, "1",
str(int(end)-int(start)), "0", decItem, "block", color, "", mouseover]
ofh.write("\t".join(row))
ofh.write("\n")
def rule1Regions(rec, nUpstreamJxn=50, mode="cds"):
"""50 bp rule: paint CDS positions near the transcript's last splice
junction. The last junction is taken over ALL exons of the transcript,
including 3'UTR introns, because those introns deposit EJCs that can
trigger NMD.
Two modes control how the nUpstreamJxn walk-back from the last junction
is counted:
mode='cds' - count only CDS nucleotides, skipping any 3'UTR as you
walk upstream. Paints up to nUpstreamJxn bp of CDS
regardless of how far the last junction sits past the
stop codon. This is what the 'last 50 bp of last
junction' label most literally means in terms of CDS.
mode='mrna' - count mRNA nucleotides (including 3'UTR). If the
walk-back stays in 3'UTR the whole way, no CDS is
painted; otherwise the CDS overlap of the walked
window is painted. This tracks the 55 bp rule
literature, where the distance is an mRNA distance.
Plus any CDS downstream of the last junction (the CDS overlap with the
last mRNA exon). Returns a list of (genStart, genEnd) tuples in
genomic order, CDS-only."""
strand = rec["strand"]
cdsStart = rec["cdsStart"]
cdsEnd = rec["cdsEnd"]
exonStarts = rec["exonStarts"]
exonEnds = rec["exonEnds"]
if len(exonStarts) < 2:
return [] # no splice junction -> rule does not apply
exons = list(zip(exonStarts, exonEnds))
if strand == "-":
exons = list(reversed(exons)) # exons now in mRNA 5'->3' order
def clipToCds(s, e):
s2, e2 = max(s, cdsStart), min(e, cdsEnd)
return (s2, e2) if s2 < e2 else None
out = []
# everything downstream of the last junction = last mRNA exon clipped to CDS
clipped = clipToCds(*exons[-1])
if clipped:
out.append(clipped)
# walk upstream of the last junction through earlier exons in mRNA order
remaining = nUpstreamJxn
for i in range(len(exons) - 2, -1, -1):
if remaining <= 0:
break
exStart, exEnd = exons[i]
if mode == "cds":
# count only CDS bp; skip 3'UTR-only exons entirely
cds = clipToCds(exStart, exEnd)
if not cds:
continue
cdsS, cdsE = cds
cdsLen = cdsE - cdsS
if remaining >= cdsLen:
out.append((cdsS, cdsE))
remaining -= cdsLen
else:
# take the 3'-most `remaining` CDS bp
if strand == "+":
out.append((cdsE - remaining, cdsE))
else:
out.append((cdsS, cdsS + remaining))
remaining = 0
else: # mode == "mrna"
exLen = exEnd - exStart
if remaining >= exLen:
chunk = (exStart, exEnd)
remaining -= exLen
else:
# take the 3'-most `remaining` mRNA bp
if strand == "+":
chunk = (exEnd - remaining, exEnd)
else:
chunk = (exStart, exStart + remaining)
remaining = 0
clipped = clipToCds(*chunk)
if clipped:
out.append(clipped)
return sorted(out)
def outputExonsUpTo(from3Prime, cdsExons, chrom, txStart, txEnd, name, n, ofh, rule):
""" given a list of (start, end), output start-end BEDs to ofh that cover n nucleotides.
Returns list of (chrom, start, end) regions output. """
doneNs = 0
doStop = False
regions = []
if from3Prime:
cdsExons = list(reversed(cdsExons))
# -50 means "-50 from the last junction" so take length of last exon + 50
if n < 0:
ex1Start = cdsExons[0][0]
ex1End = cdsExons[0][1]
n = (ex1End-ex1Start)+50
for start, end in cdsExons:
if doneNs >= n:
return regions
exLen = end-start
missBps = n-doneNs
if doneNs+exLen > n:
if from3Prime:
start = end-missBps
else:
end = start+missBps
doStop = True
bed = [chrom, str(start), str(end), name]
doneNs += exLen
bedOut(bed, txStart, txEnd, ofh, rule)
regions.append( (chrom, start, end) )
if doStop:
return regions
return regions
# ----------- main --------------
def main():
args, options = parseArgs()
inFname, outDecoFname, outCollapsedFname = args
decoOfh = open(outDecoFname, "w")
# collect regions for the collapsed output:
- # key = (chrom, start, end, rule) -> {"transcripts": [...], "strand": set(), "geneSym": str}
- regionData = defaultdict(lambda: {"transcripts": [], "strands": set(), "geneSym": ""})
+ # key = (chrom, start, end, rule) -> {"transcripts": [...], "ncbiIds": [...],
+ # "strand": set(), "geneSym": str}
+ regionData = defaultdict(lambda: {"transcripts": [], "ncbiIds": [], "strands": set(), "geneSym": ""})
- for rec in openInput(inFname, options.format, options.geneSymField):
+ for rec in openInput(inFname, options.format, options.geneSymField, options.ncbiIdField):
name = rec["name"]
chrom = rec["chrom"]
strand = rec["strand"]
txStart = rec["txStart"]
txEnd = rec["txEnd"]
cdsStart = rec["cdsStart"]
cdsEnd = rec["cdsEnd"]
# skip non-coding transcripts (cdsStart == cdsEnd)
if cdsStart >= cdsEnd:
continue
# gene symbol from record, fall back to transcript name
geneSym = rec["geneSym"]
# only keep exons that have CDS and cut around CDS
cdsExons = []
for exStart, exEnd in zip(rec["exonStarts"], rec["exonEnds"]):
# 5' UTR
if cdsStart > exEnd:
continue
# 3' UTR
if exStart > cdsEnd:
continue
if (exStart <= cdsStart and cdsStart <= exEnd):
exStart = cdsStart
if (exStart <= cdsEnd and cdsEnd <= exEnd):
exEnd = cdsEnd
# skip degenerate cdsExons (CDS boundary lands exactly on exon boundary)
if exStart >= exEnd:
continue
cdsExons.append( (exStart, exEnd) )
+ ncbiId = rec.get("ncbiId", "")
+
def addRegions(regions, rule):
for r in regions:
+ if options.noCollapse:
+ key = (r[0], r[1], r[2], rule, name)
+ else:
key = (r[0], r[1], r[2], rule)
regionData[key]["transcripts"].append(name)
+ if ncbiId:
+ regionData[key]["ncbiIds"].append(ncbiId)
regionData[key]["strands"].add(strand)
if geneSym and not regionData[key]["geneSym"]:
regionData[key]["geneSym"] = geneSym
# rule 2 applies when no EJC can be deposited downstream of the stop codon.
# that requires: single CDS exon (no CDS-CDS junction) AND no 3'UTR intron.
# 5'UTR introns are allowed - their EJCs are upstream of the stop and either
# are cleared by the scanning 40S or are never encountered by the terminating
# ribosome, so they do not trigger NMD.
if strand == "+":
hasThreeUtrIntron = any(s > cdsEnd for s in rec["exonStarts"])
else:
hasThreeUtrIntron = any(e < cdsStart for e in rec["exonEnds"])
if len(cdsExons) == 1 and not hasThreeUtrIntron:
# rule 2: no EJC downstream of stop codon -> any PTC in CDS escapes NMD
bed = [chrom, str(cdsStart), str(cdsEnd), name]
bedOut(bed, txStart, txEnd, decoOfh, 2)
addRegions([(chrom, cdsStart, cdsEnd)], 2)
else:
# rule 3: first 100bp of coding nucleotides
if strand == "+":
regions = outputExonsUpTo(False, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3)
else:
regions = outputExonsUpTo(True, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3)
addRegions(regions, 3)
# rule 1: within 50 mRNA-bp upstream of the last splice junction of
# the transcript (incl. 3'UTR introns), plus anything downstream of
# that junction, clipped to CDS.
rule1Tuples = []
for s, e in rule1Regions(rec, 50, mode=options.rule1Mode):
bed = [chrom, str(s), str(e), name]
bedOut(bed, txStart, txEnd, decoOfh, 1)
rule1Tuples.append((chrom, s, e))
addRegions(rule1Tuples, 1)
# rule 4: coding exons >400 nt that have a downstream splice junction.
# Without a 3'UTR intron, the last CDS exon is the last mRNA exon
# and is excluded (no downstream EJC). With a 3'UTR intron, every
# CDS exon has a downstream junction and is eligible.
if hasThreeUtrIntron:
rule4Exons = cdsExons
elif strand == "+":
rule4Exons = cdsExons[:-1]
else:
rule4Exons = cdsExons[1:]
for exStart, exEnd in rule4Exons:
if exEnd - exStart > LONG_EXON_THRESHOLD:
bed = [chrom, str(exStart), str(exEnd), name]
bedOut(bed, txStart, txEnd, decoOfh, 4)
addRegions([(chrom, exStart, exEnd)], 4)
decoOfh.close()
- # write collapsed output as bed 9 + mouseover + transcripts
+ # write collapsed output as bed 9 + mouseover + transcripts + ncbiIds
collOfh = open(outCollapsedFname, "w")
- for (chrom, start, end, rule), data in sorted(regionData.items()):
+ for key, data in sorted(regionData.items()):
+ chrom, start, end, rule = key[:4]
txList = sorted(set(data["transcripts"]))
+ ncbiList = sorted(set(data["ncbiIds"]))
geneSym = data["geneSym"]
if not geneSym:
geneSym = txList[0]
# pick strand: use the strand if all transcripts agree, else "."
strands = data["strands"]
strand = list(strands)[0] if len(strands) == 1 else "."
color = RULE_COLORS[rule]
txListStr = ",".join(txList)
+ ncbiListStr = ",".join(ncbiList)
if len(txList) <= 3:
mouseover = RULE_DESCRIPTIONS[rule] + " (" + ", ".join(txList) + ")"
else:
mouseover = RULE_DESCRIPTIONS[rule] + " (" + str(len(txList)) + " transcripts)"
row = [chrom, str(start), str(end), geneSym, "0", strand,
- str(start), str(end), color, mouseover, txListStr]
+ str(start), str(end), color, mouseover, txListStr, ncbiListStr]
collOfh.write("\t".join(row))
collOfh.write("\n")
collOfh.close()
logging.info("Wrote %d decorator regions to %s" % (sum(len(d["transcripts"]) for d in regionData.values()), outDecoFname))
logging.info("Wrote %d collapsed regions to %s" % (len(regionData), outCollapsedFname))
main()