src/hg/makeDb/scripts/nmd/genePredNmdEsc 3a62ea7e9a8cb3503586a0a78570331308c9bc58

3a62ea7e9a8cb3503586a0a78570331308c9bc58
max
  Mon Apr 27 02:23:00 2026 -0700
NMD Escape MANE: expose NM_ accession via labelFields. refs #33737

Per QA, the MANE subtrack now shows the NCBI RefSeq accession by default
instead of the HGNC gene symbol, with the ENST and gene symbol still
selectable via labelFields.

- genePredNmdEsc: new --ncbi-id-field N option (default -1 = unused).
When set, the named bigGenePred column is captured per-transcript and
written into a new ncbiIds output column. For MANE pass 21.
- genePredNmdEsc: new --no-collapse option. By default, regions with
identical (chrom, start, end, rule) from multiple transcripts collapse
into one row with comma-separated lists. With --no-collapse the script
emits one row per (transcript, region). Used for MANE so each
label-field column holds a single value: the 74 MANE Plus Clinical
genes (e.g. LMNA) get two rows per region instead of one row with a
two-element list.
- nmdEscCollapsed.as: add lstring ncbiIds column. Schema is now bed9+3.
- nmd.ra (nmdEscMane only): labelFields ncbiIds,name,transcripts;
defaultLabelFields ncbiIds; labelSeparator " / ". Gencode and RefSeq
subtracks unchanged - they default to the gene symbol (name column)
and have an empty ncbiIds column.
- doc/hg38/nmd.txt: bump all three bedToBigBed invocations to bed9+3
and document the --ncbi-id-field 21 + --no-collapse invocation for
MANE.

Counts: MANE 68,028 (--no-collapse); Gencode 233,375; RefSeq 112,356.

diff --git src/hg/makeDb/scripts/nmd/genePredNmdEsc src/hg/makeDb/scripts/nmd/genePredNmdEsc
index e599b86bcfd..baf08c7b87e 100755
--- src/hg/makeDb/scripts/nmd/genePredNmdEsc
+++ src/hg/makeDb/scripts/nmd/genePredNmdEsc
@@ -1,409 +1,430 @@
 #!/usr/bin/env python
 
 import logging, sys, optparse, gzip, subprocess
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile
 
 # ==== functions =====
 
 # Colors by rule
 RULE_COLORS = {
     1: "255,0,0",       # red - last 50bp of last coding junction
     2: "255,140,0",     # orange - intronless transcript
     3: "139,0,0",       # dark red - first 100bp of coding nucleotides
     4: "255,215,0",     # gold - long coding exon (>400 nt)
 }
 
 RULE_DESCRIPTIONS = {
     1: "<b>Rule 1</b> - CDS within 50bp of the last splice junction",
     2: "<b>Rule 2</b> - single coding exon, no 3'UTR intron",
     3: "<b>Rule 3</b> - first 100bp of coding nucleotides",
     4: "<b>Rule 4</b> - PTC in long exon (>400 nt)",
 }
 
 # Lindeboom et al. 2016: reduced NMD efficiency for PTCs in exons longer than this
 LONG_EXON_THRESHOLD = 400
 
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("usage: %prog [options] inFname outDecoFname outCollapsedFname - "
         "Output BEDs with NMD escape regions. First output is decorator format, "
         "second is collapsed bigGenePred with gene symbols and transcript lists.")
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true",
         help="show debug messages")
     parser.add_option("-f", "--format", dest="format", action="store", default="genePredExt",
         help="Input format: 'genePredExt' (with bin column, e.g. ncbiRefSeq) or "
              "'bigGenePred' (e.g. gencode .bb file, will use bigBedToBed). Default: genePredExt")
     parser.add_option("--gene-sym-field", dest="geneSymField", type="int", default=17,
         help="bigGenePred 0-based field index that holds the gene symbol. "
              "Default 17 (Gencode 'geneName'). MANE puts the HGNC symbol in field 18.")
+    parser.add_option("--ncbi-id-field", dest="ncbiIdField", type="int", default=-1,
+        help="bigGenePred 0-based field index that holds the NCBI RefSeq accession "
+             "(NM_/NR_). Default -1 (not extracted). For MANE, pass 21.")
+    parser.add_option("--no-collapse", dest="noCollapse", action="store_true", default=False,
+        help="Emit one row per (transcript, region) instead of collapsing identical "
+             "coordinates across transcripts. Useful for sets like MANE where each "
+             "label-field column should be a single value.")
     parser.add_option("--rule1-mode", dest="rule1Mode", action="store", default="cds",
         choices=["cds", "mrna"],
         help="How to count the 50 bp walk-back from the last splice junction. "
              "'cds' (default): count only CDS nucleotides, skipping 3'UTR; "
              "paints up to 50 bp of CDS regardless of how far the last junction "
              "sits past the stop codon. "
              "'mrna': count mRNA nucleotides including 3'UTR, then clip output "
              "to CDS; when the last junction is more than 50 mRNA-bp past the "
              "stop codon, no CDS position is painted (tracks the 55 bp rule "
              "literature more closely).")
     (options, args) = parser.parse_args()
 
     if len(args) != 3:
         parser.print_help()
         exit(1)
 
     if options.debug:
         logging.basicConfig(level=logging.DEBUG)
         logging.getLogger().setLevel(logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
         logging.getLogger().setLevel(logging.INFO)
 
     return args, options
 
-def openInput(fname, fmt, geneSymField=17):
+def openInput(fname, fmt, geneSymField=17, ncbiIdField=-1):
     """Open input file and yield parsed transcript dicts.
     Both formats yield dicts with keys: name, chrom, strand, txStart, txEnd,
-    cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, geneSym, cdsStartStat, cdsEndStat, exonFrames
+    cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, geneSym, ncbiId,
+    cdsStartStat, cdsEndStat, exonFrames
     """
     if fmt == "bigGenePred":
         # pipe through bigBedToBed
         proc = subprocess.Popen(["bigBedToBed", fname, "stdout"],
             stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
         fh = proc.stdout
     elif fname.endswith(".gz"):
         fh = gzip.open(fname, "rt")
     else:
         fh = open(fname)
 
     for line in fh:
         line = line.rstrip("\n\r")
         if not line:
             continue
         fields = line.split("\t")
 
         if fmt == "genePredExt":
             # genePredExt with bin column:
             # bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd,
             # exonCount, exonStarts, exonEnds, score, name2, cdsStartStat, cdsEndStat, exonFrames
             rec = {
                 "name": fields[1],
                 "chrom": fields[2],
                 "strand": fields[3],
                 "txStart": int(fields[4]),
                 "txEnd": int(fields[5]),
                 "cdsStart": int(fields[6]),
                 "cdsEnd": int(fields[7]),
                 "exonCount": int(fields[8]),
                 "exonStarts": [int(x) for x in fields[9].strip(",").split(",") if x],
                 "exonEnds": [int(x) for x in fields[10].strip(",").split(",") if x],
                 "geneSym": fields[12] if len(fields) > 12 else "",
+                "ncbiId": "",
                 "cdsStartStat": fields[13] if len(fields) > 13 else "none",
                 "cdsEndStat": fields[14] if len(fields) > 14 else "none",
                 "exonFrames": fields[15].strip(",") if len(fields) > 15 else "",
             }
         elif fmt == "bigGenePred":
             # bigGenePred:
             # chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd,
             # color, blockCount, blockSizes, chromStarts, name2, cdsStartStat, cdsEndStat,
             # exonFrames, type, geneName, geneName2, geneType
             chromStart = int(fields[1])
             blockSizes = [int(x) for x in fields[10].strip(",").split(",") if x]
             blockStarts = [int(x) for x in fields[11].strip(",").split(",") if x]
             rec = {
                 "name": fields[3],
                 "chrom": fields[0],
                 "strand": fields[5],
                 "txStart": chromStart,
                 "txEnd": int(fields[2]),
                 "cdsStart": int(fields[6]),
                 "cdsEnd": int(fields[7]),
                 "exonCount": int(fields[9]),
                 "exonStarts": [chromStart + s for s in blockStarts],
                 "exonEnds": [chromStart + s + sz for s, sz in zip(blockStarts, blockSizes)],
                 "geneSym": fields[geneSymField] if len(fields) > geneSymField else "",
+                "ncbiId": fields[ncbiIdField] if (ncbiIdField >= 0 and len(fields) > ncbiIdField) else "",
                 "cdsStartStat": fields[13] if len(fields) > 13 else "none",
                 "cdsEndStat": fields[14] if len(fields) > 14 else "none",
                 "exonFrames": fields[15].strip(",") if len(fields) > 15 else "",
             }
         else:
             raise ValueError("Unknown format: " + fmt)
 
         yield rec
 
     if fmt == "bigGenePred":
         proc.wait()
 
 def bedOut(row, txStart, txEnd, ofh, rule):
     "write a decorator BED line"
     row = [str(x) for x in row]
     chrom, start, end, name = row
     decItem = chrom+":"+str(txStart)+"-"+str(txEnd)+":"+name
     color = RULE_COLORS[rule]
     mouseover = RULE_DESCRIPTIONS[rule]
     row = [chrom, start, end, name, "0", ".", start, end, color, "1",
            str(int(end)-int(start)), "0", decItem, "block", color, "", mouseover]
     ofh.write("\t".join(row))
     ofh.write("\n")
 
 
 def rule1Regions(rec, nUpstreamJxn=50, mode="cds"):
     """50 bp rule: paint CDS positions near the transcript's last splice
     junction. The last junction is taken over ALL exons of the transcript,
     including 3'UTR introns, because those introns deposit EJCs that can
     trigger NMD.
 
     Two modes control how the nUpstreamJxn walk-back from the last junction
     is counted:
       mode='cds'  - count only CDS nucleotides, skipping any 3'UTR as you
                     walk upstream. Paints up to nUpstreamJxn bp of CDS
                     regardless of how far the last junction sits past the
                     stop codon. This is what the 'last 50 bp of last
                     junction' label most literally means in terms of CDS.
       mode='mrna' - count mRNA nucleotides (including 3'UTR). If the
                     walk-back stays in 3'UTR the whole way, no CDS is
                     painted; otherwise the CDS overlap of the walked
                     window is painted. This tracks the 55 bp rule
                     literature, where the distance is an mRNA distance.
 
     Plus any CDS downstream of the last junction (the CDS overlap with the
     last mRNA exon). Returns a list of (genStart, genEnd) tuples in
     genomic order, CDS-only."""
     strand = rec["strand"]
     cdsStart = rec["cdsStart"]
     cdsEnd = rec["cdsEnd"]
     exonStarts = rec["exonStarts"]
     exonEnds = rec["exonEnds"]
 
     if len(exonStarts) < 2:
         return []  # no splice junction -> rule does not apply
 
     exons = list(zip(exonStarts, exonEnds))
     if strand == "-":
         exons = list(reversed(exons))  # exons now in mRNA 5'->3' order
 
     def clipToCds(s, e):
         s2, e2 = max(s, cdsStart), min(e, cdsEnd)
         return (s2, e2) if s2 < e2 else None
 
     out = []
     # everything downstream of the last junction = last mRNA exon clipped to CDS
     clipped = clipToCds(*exons[-1])
     if clipped:
         out.append(clipped)
 
     # walk upstream of the last junction through earlier exons in mRNA order
     remaining = nUpstreamJxn
     for i in range(len(exons) - 2, -1, -1):
         if remaining <= 0:
             break
         exStart, exEnd = exons[i]
 
         if mode == "cds":
             # count only CDS bp; skip 3'UTR-only exons entirely
             cds = clipToCds(exStart, exEnd)
             if not cds:
                 continue
             cdsS, cdsE = cds
             cdsLen = cdsE - cdsS
             if remaining >= cdsLen:
                 out.append((cdsS, cdsE))
                 remaining -= cdsLen
             else:
                 # take the 3'-most `remaining` CDS bp
                 if strand == "+":
                     out.append((cdsE - remaining, cdsE))
                 else:
                     out.append((cdsS, cdsS + remaining))
                 remaining = 0
 
         else:  # mode == "mrna"
             exLen = exEnd - exStart
             if remaining >= exLen:
                 chunk = (exStart, exEnd)
                 remaining -= exLen
             else:
                 # take the 3'-most `remaining` mRNA bp
                 if strand == "+":
                     chunk = (exEnd - remaining, exEnd)
                 else:
                     chunk = (exStart, exStart + remaining)
                 remaining = 0
             clipped = clipToCds(*chunk)
             if clipped:
                 out.append(clipped)
 
     return sorted(out)
 
 
 def outputExonsUpTo(from3Prime, cdsExons, chrom, txStart, txEnd, name, n, ofh, rule):
     """ given a list of (start, end), output start-end BEDs to ofh that cover n nucleotides.
         Returns list of (chrom, start, end) regions output. """
     doneNs = 0
     doStop = False
     regions = []
     if from3Prime:
         cdsExons = list(reversed(cdsExons))
 
     # -50 means "-50 from the last junction" so take length of last exon + 50
     if n < 0:
         ex1Start = cdsExons[0][0]
         ex1End = cdsExons[0][1]
         n = (ex1End-ex1Start)+50
 
     for start, end in cdsExons:
         if doneNs >= n:
             return regions
         exLen = end-start
         missBps = n-doneNs
         if doneNs+exLen > n:
             if from3Prime:
                 start = end-missBps
             else:
                 end = start+missBps
             doStop = True
         bed = [chrom, str(start), str(end), name]
         doneNs += exLen
         bedOut(bed, txStart, txEnd, ofh, rule)
         regions.append( (chrom, start, end) )
         if doStop:
             return regions
     return regions
 
 # ----------- main --------------
 def main():
     args, options = parseArgs()
 
     inFname, outDecoFname, outCollapsedFname = args
 
     decoOfh = open(outDecoFname, "w")
 
     # collect regions for the collapsed output:
-    # key = (chrom, start, end, rule) -> {"transcripts": [...], "strand": set(), "geneSym": str}
-    regionData = defaultdict(lambda: {"transcripts": [], "strands": set(), "geneSym": ""})
+    # key = (chrom, start, end, rule) -> {"transcripts": [...], "ncbiIds": [...],
+    #                                     "strand": set(), "geneSym": str}
+    regionData = defaultdict(lambda: {"transcripts": [], "ncbiIds": [], "strands": set(), "geneSym": ""})
 
-    for rec in openInput(inFname, options.format, options.geneSymField):
+    for rec in openInput(inFname, options.format, options.geneSymField, options.ncbiIdField):
         name = rec["name"]
         chrom = rec["chrom"]
         strand = rec["strand"]
         txStart = rec["txStart"]
         txEnd = rec["txEnd"]
         cdsStart = rec["cdsStart"]
         cdsEnd = rec["cdsEnd"]
 
         # skip non-coding transcripts (cdsStart == cdsEnd)
         if cdsStart >= cdsEnd:
             continue
 
         # gene symbol from record, fall back to transcript name
         geneSym = rec["geneSym"]
 
         # only keep exons that have CDS and cut around CDS
         cdsExons = []
         for exStart, exEnd in zip(rec["exonStarts"], rec["exonEnds"]):
             # 5' UTR
             if cdsStart > exEnd:
                 continue
             # 3' UTR
             if exStart > cdsEnd:
                 continue
             if (exStart <= cdsStart and cdsStart <= exEnd):
                 exStart = cdsStart
             if (exStart <= cdsEnd and cdsEnd <= exEnd):
                 exEnd = cdsEnd
             # skip degenerate cdsExons (CDS boundary lands exactly on exon boundary)
             if exStart >= exEnd:
                 continue
             cdsExons.append( (exStart, exEnd) )
 
+        ncbiId = rec.get("ncbiId", "")
+
         def addRegions(regions, rule):
             for r in regions:
+                if options.noCollapse:
+                    key = (r[0], r[1], r[2], rule, name)
+                else:
                     key = (r[0], r[1], r[2], rule)
                 regionData[key]["transcripts"].append(name)
+                if ncbiId:
+                    regionData[key]["ncbiIds"].append(ncbiId)
                 regionData[key]["strands"].add(strand)
                 if geneSym and not regionData[key]["geneSym"]:
                     regionData[key]["geneSym"] = geneSym
 
         # rule 2 applies when no EJC can be deposited downstream of the stop codon.
         # that requires: single CDS exon (no CDS-CDS junction) AND no 3'UTR intron.
         # 5'UTR introns are allowed - their EJCs are upstream of the stop and either
         # are cleared by the scanning 40S or are never encountered by the terminating
         # ribosome, so they do not trigger NMD.
         if strand == "+":
             hasThreeUtrIntron = any(s > cdsEnd for s in rec["exonStarts"])
         else:
             hasThreeUtrIntron = any(e < cdsStart for e in rec["exonEnds"])
 
         if len(cdsExons) == 1 and not hasThreeUtrIntron:
             # rule 2: no EJC downstream of stop codon -> any PTC in CDS escapes NMD
             bed = [chrom, str(cdsStart), str(cdsEnd), name]
             bedOut(bed, txStart, txEnd, decoOfh, 2)
             addRegions([(chrom, cdsStart, cdsEnd)], 2)
         else:
             # rule 3: first 100bp of coding nucleotides
             if strand == "+":
                 regions = outputExonsUpTo(False, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3)
             else:
                 regions = outputExonsUpTo(True, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3)
             addRegions(regions, 3)
 
             # rule 1: within 50 mRNA-bp upstream of the last splice junction of
             # the transcript (incl. 3'UTR introns), plus anything downstream of
             # that junction, clipped to CDS.
             rule1Tuples = []
             for s, e in rule1Regions(rec, 50, mode=options.rule1Mode):
                 bed = [chrom, str(s), str(e), name]
                 bedOut(bed, txStart, txEnd, decoOfh, 1)
                 rule1Tuples.append((chrom, s, e))
             addRegions(rule1Tuples, 1)
 
             # rule 4: coding exons >400 nt that have a downstream splice junction.
             # Without a 3'UTR intron, the last CDS exon is the last mRNA exon
             # and is excluded (no downstream EJC). With a 3'UTR intron, every
             # CDS exon has a downstream junction and is eligible.
             if hasThreeUtrIntron:
                 rule4Exons = cdsExons
             elif strand == "+":
                 rule4Exons = cdsExons[:-1]
             else:
                 rule4Exons = cdsExons[1:]
 
             for exStart, exEnd in rule4Exons:
                 if exEnd - exStart > LONG_EXON_THRESHOLD:
                     bed = [chrom, str(exStart), str(exEnd), name]
                     bedOut(bed, txStart, txEnd, decoOfh, 4)
                     addRegions([(chrom, exStart, exEnd)], 4)
 
     decoOfh.close()
 
-    # write collapsed output as bed 9 + mouseover + transcripts
+    # write collapsed output as bed 9 + mouseover + transcripts + ncbiIds
     collOfh = open(outCollapsedFname, "w")
-    for (chrom, start, end, rule), data in sorted(regionData.items()):
+    for key, data in sorted(regionData.items()):
+        chrom, start, end, rule = key[:4]
         txList = sorted(set(data["transcripts"]))
+        ncbiList = sorted(set(data["ncbiIds"]))
         geneSym = data["geneSym"]
         if not geneSym:
             geneSym = txList[0]
 
         # pick strand: use the strand if all transcripts agree, else "."
         strands = data["strands"]
         strand = list(strands)[0] if len(strands) == 1 else "."
 
         color = RULE_COLORS[rule]
         txListStr = ",".join(txList)
+        ncbiListStr = ",".join(ncbiList)
         if len(txList) <= 3:
             mouseover = RULE_DESCRIPTIONS[rule] + " (" + ", ".join(txList) + ")"
         else:
             mouseover = RULE_DESCRIPTIONS[rule] + " (" + str(len(txList)) + " transcripts)"
 
         row = [chrom, str(start), str(end), geneSym, "0", strand,
-               str(start), str(end), color, mouseover, txListStr]
+               str(start), str(end), color, mouseover, txListStr, ncbiListStr]
         collOfh.write("\t".join(row))
         collOfh.write("\n")
 
     collOfh.close()
     logging.info("Wrote %d decorator regions to %s" % (sum(len(d["transcripts"]) for d in regionData.values()), outDecoFname))
     logging.info("Wrote %d collapsed regions to %s" % (len(regionData), outCollapsedFname))
 
 main()