src/hg/makeDb/scripts/nmd/genePredNmdEsc 3a62ea7e9a8cb3503586a0a78570331308c9bc58

3a62ea7e9a8cb3503586a0a78570331308c9bc58
max
  Mon Apr 27 02:23:00 2026 -0700
NMD Escape MANE: expose NM_ accession via labelFields. refs #33737

Per QA, the MANE subtrack now shows the NCBI RefSeq accession by default
instead of the HGNC gene symbol, with the ENST and gene symbol still
selectable via labelFields.

- genePredNmdEsc: new --ncbi-id-field N option (default -1 = unused).
When set, the named bigGenePred column is captured per-transcript and
written into a new ncbiIds output column. For MANE pass 21.
- genePredNmdEsc: new --no-collapse option. By default, regions with
identical (chrom, start, end, rule) from multiple transcripts collapse
into one row with comma-separated lists. With --no-collapse the script
emits one row per (transcript, region). Used for MANE so each
label-field column holds a single value: the 74 MANE Plus Clinical
genes (e.g. LMNA) get two rows per region instead of one row with a
two-element list.
- nmdEscCollapsed.as: add lstring ncbiIds column. Schema is now bed9+3.
- nmd.ra (nmdEscMane only): labelFields ncbiIds,name,transcripts;
defaultLabelFields ncbiIds; labelSeparator " / ". Gencode and RefSeq
subtracks unchanged - they default to the gene symbol (name column)
and have an empty ncbiIds column.
- doc/hg38/nmd.txt: bump all three bedToBigBed invocations to bed9+3
and document the --ncbi-id-field 21 + --no-collapse invocation for
MANE.

Counts: MANE 68,028 (--no-collapse); Gencode 233,375; RefSeq 112,356.

diff --git src/hg/makeDb/scripts/nmd/genePredNmdEsc src/hg/makeDb/scripts/nmd/genePredNmdEsc
index e599b86bcfd..baf08c7b87e 100755
--- src/hg/makeDb/scripts/nmd/genePredNmdEsc
+++ src/hg/makeDb/scripts/nmd/genePredNmdEsc
@@ -26,59 +26,67 @@
 
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("usage: %prog [options] inFname outDecoFname outCollapsedFname - "
         "Output BEDs with NMD escape regions. First output is decorator format, "
         "second is collapsed bigGenePred with gene symbols and transcript lists.")
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true",
         help="show debug messages")
     parser.add_option("-f", "--format", dest="format", action="store", default="genePredExt",
         help="Input format: 'genePredExt' (with bin column, e.g. ncbiRefSeq) or "
              "'bigGenePred' (e.g. gencode .bb file, will use bigBedToBed). Default: genePredExt")
     parser.add_option("--gene-sym-field", dest="geneSymField", type="int", default=17,
         help="bigGenePred 0-based field index that holds the gene symbol. "
              "Default 17 (Gencode 'geneName'). MANE puts the HGNC symbol in field 18.")
+    parser.add_option("--ncbi-id-field", dest="ncbiIdField", type="int", default=-1,
+        help="bigGenePred 0-based field index that holds the NCBI RefSeq accession "
+             "(NM_/NR_). Default -1 (not extracted). For MANE, pass 21.")
+    parser.add_option("--no-collapse", dest="noCollapse", action="store_true", default=False,
+        help="Emit one row per (transcript, region) instead of collapsing identical "
+             "coordinates across transcripts. Useful for sets like MANE where each "
+             "label-field column should be a single value.")
     parser.add_option("--rule1-mode", dest="rule1Mode", action="store", default="cds",
         choices=["cds", "mrna"],
         help="How to count the 50 bp walk-back from the last splice junction. "
              "'cds' (default): count only CDS nucleotides, skipping 3'UTR; "
              "paints up to 50 bp of CDS regardless of how far the last junction "
              "sits past the stop codon. "
              "'mrna': count mRNA nucleotides including 3'UTR, then clip output "
              "to CDS; when the last junction is more than 50 mRNA-bp past the "
              "stop codon, no CDS position is painted (tracks the 55 bp rule "
              "literature more closely).")
     (options, args) = parser.parse_args()
 
     if len(args) != 3:
         parser.print_help()
         exit(1)
 
     if options.debug:
         logging.basicConfig(level=logging.DEBUG)
         logging.getLogger().setLevel(logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
         logging.getLogger().setLevel(logging.INFO)
 
     return args, options
 
-def openInput(fname, fmt, geneSymField=17):
+def openInput(fname, fmt, geneSymField=17, ncbiIdField=-1):
     """Open input file and yield parsed transcript dicts.
     Both formats yield dicts with keys: name, chrom, strand, txStart, txEnd,
-    cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, geneSym, cdsStartStat, cdsEndStat, exonFrames
+    cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, geneSym, ncbiId,
+    cdsStartStat, cdsEndStat, exonFrames
     """
     if fmt == "bigGenePred":
         # pipe through bigBedToBed
         proc = subprocess.Popen(["bigBedToBed", fname, "stdout"],
             stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
         fh = proc.stdout
     elif fname.endswith(".gz"):
         fh = gzip.open(fname, "rt")
     else:
         fh = open(fname)
 
     for line in fh:
         line = line.rstrip("\n\r")
         if not line:
             continue
@@ -88,54 +96,56 @@
             # genePredExt with bin column:
             # bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd,
             # exonCount, exonStarts, exonEnds, score, name2, cdsStartStat, cdsEndStat, exonFrames
             rec = {
                 "name": fields[1],
                 "chrom": fields[2],
                 "strand": fields[3],
                 "txStart": int(fields[4]),
                 "txEnd": int(fields[5]),
                 "cdsStart": int(fields[6]),
                 "cdsEnd": int(fields[7]),
                 "exonCount": int(fields[8]),
                 "exonStarts": [int(x) for x in fields[9].strip(",").split(",") if x],
                 "exonEnds": [int(x) for x in fields[10].strip(",").split(",") if x],
                 "geneSym": fields[12] if len(fields) > 12 else "",
+                "ncbiId": "",
                 "cdsStartStat": fields[13] if len(fields) > 13 else "none",
                 "cdsEndStat": fields[14] if len(fields) > 14 else "none",
                 "exonFrames": fields[15].strip(",") if len(fields) > 15 else "",
             }
         elif fmt == "bigGenePred":
             # bigGenePred:
             # chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd,
             # color, blockCount, blockSizes, chromStarts, name2, cdsStartStat, cdsEndStat,
             # exonFrames, type, geneName, geneName2, geneType
             chromStart = int(fields[1])
             blockSizes = [int(x) for x in fields[10].strip(",").split(",") if x]
             blockStarts = [int(x) for x in fields[11].strip(",").split(",") if x]
             rec = {
                 "name": fields[3],
                 "chrom": fields[0],
                 "strand": fields[5],
                 "txStart": chromStart,
                 "txEnd": int(fields[2]),
                 "cdsStart": int(fields[6]),
                 "cdsEnd": int(fields[7]),
                 "exonCount": int(fields[9]),
                 "exonStarts": [chromStart + s for s in blockStarts],
                 "exonEnds": [chromStart + s + sz for s, sz in zip(blockStarts, blockSizes)],
                 "geneSym": fields[geneSymField] if len(fields) > geneSymField else "",
+                "ncbiId": fields[ncbiIdField] if (ncbiIdField >= 0 and len(fields) > ncbiIdField) else "",
                 "cdsStartStat": fields[13] if len(fields) > 13 else "none",
                 "cdsEndStat": fields[14] if len(fields) > 14 else "none",
                 "exonFrames": fields[15].strip(",") if len(fields) > 15 else "",
             }
         else:
             raise ValueError("Unknown format: " + fmt)
 
         yield rec
 
     if fmt == "bigGenePred":
         proc.wait()
 
 def bedOut(row, txStart, txEnd, ofh, rule):
     "write a decorator BED line"
     row = [str(x) for x in row]
@@ -269,34 +279,35 @@
         bedOut(bed, txStart, txEnd, ofh, rule)
         regions.append( (chrom, start, end) )
         if doStop:
             return regions
     return regions
 
 # ----------- main --------------
 def main():
     args, options = parseArgs()
 
     inFname, outDecoFname, outCollapsedFname = args
 
     decoOfh = open(outDecoFname, "w")
 
     # collect regions for the collapsed output:
-    # key = (chrom, start, end, rule) -> {"transcripts": [...], "strand": set(), "geneSym": str}
-    regionData = defaultdict(lambda: {"transcripts": [], "strands": set(), "geneSym": ""})
+    # key = (chrom, start, end, rule) -> {"transcripts": [...], "ncbiIds": [...],
+    #                                     "strand": set(), "geneSym": str}
+    regionData = defaultdict(lambda: {"transcripts": [], "ncbiIds": [], "strands": set(), "geneSym": ""})
 
-    for rec in openInput(inFname, options.format, options.geneSymField):
+    for rec in openInput(inFname, options.format, options.geneSymField, options.ncbiIdField):
         name = rec["name"]
         chrom = rec["chrom"]
         strand = rec["strand"]
         txStart = rec["txStart"]
         txEnd = rec["txEnd"]
         cdsStart = rec["cdsStart"]
         cdsEnd = rec["cdsEnd"]
 
         # skip non-coding transcripts (cdsStart == cdsEnd)
         if cdsStart >= cdsEnd:
             continue
 
         # gene symbol from record, fall back to transcript name
         geneSym = rec["geneSym"]
 
@@ -306,34 +317,41 @@
             # 5' UTR
             if cdsStart > exEnd:
                 continue
             # 3' UTR
             if exStart > cdsEnd:
                 continue
             if (exStart <= cdsStart and cdsStart <= exEnd):
                 exStart = cdsStart
             if (exStart <= cdsEnd and cdsEnd <= exEnd):
                 exEnd = cdsEnd
             # skip degenerate cdsExons (CDS boundary lands exactly on exon boundary)
             if exStart >= exEnd:
                 continue
             cdsExons.append( (exStart, exEnd) )
 
+        ncbiId = rec.get("ncbiId", "")
+
         def addRegions(regions, rule):
             for r in regions:
+                if options.noCollapse:
+                    key = (r[0], r[1], r[2], rule, name)
+                else:
                     key = (r[0], r[1], r[2], rule)
                 regionData[key]["transcripts"].append(name)
+                if ncbiId:
+                    regionData[key]["ncbiIds"].append(ncbiId)
                 regionData[key]["strands"].add(strand)
                 if geneSym and not regionData[key]["geneSym"]:
                     regionData[key]["geneSym"] = geneSym
 
         # rule 2 applies when no EJC can be deposited downstream of the stop codon.
         # that requires: single CDS exon (no CDS-CDS junction) AND no 3'UTR intron.
         # 5'UTR introns are allowed - their EJCs are upstream of the stop and either
         # are cleared by the scanning 40S or are never encountered by the terminating
         # ribosome, so they do not trigger NMD.
         if strand == "+":
             hasThreeUtrIntron = any(s > cdsEnd for s in rec["exonStarts"])
         else:
             hasThreeUtrIntron = any(e < cdsStart for e in rec["exonEnds"])
 
         if len(cdsExons) == 1 and not hasThreeUtrIntron:
@@ -366,44 +384,47 @@
             if hasThreeUtrIntron:
                 rule4Exons = cdsExons
             elif strand == "+":
                 rule4Exons = cdsExons[:-1]
             else:
                 rule4Exons = cdsExons[1:]
 
             for exStart, exEnd in rule4Exons:
                 if exEnd - exStart > LONG_EXON_THRESHOLD:
                     bed = [chrom, str(exStart), str(exEnd), name]
                     bedOut(bed, txStart, txEnd, decoOfh, 4)
                     addRegions([(chrom, exStart, exEnd)], 4)
 
     decoOfh.close()
 
-    # write collapsed output as bed 9 + mouseover + transcripts
+    # write collapsed output as bed 9 + mouseover + transcripts + ncbiIds
     collOfh = open(outCollapsedFname, "w")
-    for (chrom, start, end, rule), data in sorted(regionData.items()):
+    for key, data in sorted(regionData.items()):
+        chrom, start, end, rule = key[:4]
         txList = sorted(set(data["transcripts"]))
+        ncbiList = sorted(set(data["ncbiIds"]))
         geneSym = data["geneSym"]
         if not geneSym:
             geneSym = txList[0]
 
         # pick strand: use the strand if all transcripts agree, else "."
         strands = data["strands"]
         strand = list(strands)[0] if len(strands) == 1 else "."
 
         color = RULE_COLORS[rule]
         txListStr = ",".join(txList)
+        ncbiListStr = ",".join(ncbiList)
         if len(txList) <= 3:
             mouseover = RULE_DESCRIPTIONS[rule] + " (" + ", ".join(txList) + ")"
         else:
             mouseover = RULE_DESCRIPTIONS[rule] + " (" + str(len(txList)) + " transcripts)"
 
         row = [chrom, str(start), str(end), geneSym, "0", strand,
-               str(start), str(end), color, mouseover, txListStr]
+               str(start), str(end), color, mouseover, txListStr, ncbiListStr]
         collOfh.write("\t".join(row))
         collOfh.write("\n")
 
     collOfh.close()
     logging.info("Wrote %d decorator regions to %s" % (sum(len(d["transcripts"]) for d in regionData.values()), outDecoFname))
     logging.info("Wrote %d collapsed regions to %s" % (len(regionData), outCollapsedFname))
 
 main()