3a62ea7e9a8cb3503586a0a78570331308c9bc58 max Mon Apr 27 02:23:00 2026 -0700 NMD Escape MANE: expose NM_ accession via labelFields. refs #33737 Per QA, the MANE subtrack now shows the NCBI RefSeq accession by default instead of the HGNC gene symbol, with the ENST and gene symbol still selectable via labelFields. - genePredNmdEsc: new --ncbi-id-field N option (default -1 = unused). When set, the named bigGenePred column is captured per-transcript and written into a new ncbiIds output column. For MANE pass 21. - genePredNmdEsc: new --no-collapse option. By default, regions with identical (chrom, start, end, rule) from multiple transcripts collapse into one row with comma-separated lists. With --no-collapse the script emits one row per (transcript, region). Used for MANE so each label-field column holds a single value: the 74 MANE Plus Clinical genes (e.g. LMNA) get two rows per region instead of one row with a two-element list. - nmdEscCollapsed.as: add lstring ncbiIds column. Schema is now bed9+3. - nmd.ra (nmdEscMane only): labelFields ncbiIds,name,transcripts; defaultLabelFields ncbiIds; labelSeparator " / ". Gencode and RefSeq subtracks unchanged - they default to the gene symbol (name column) and have an empty ncbiIds column. - doc/hg38/nmd.txt: bump all three bedToBigBed invocations to bed9+3 and document the --ncbi-id-field 21 + --no-collapse invocation for MANE. Counts: MANE 68,028 (--no-collapse); Gencode 233,375; RefSeq 112,356. diff --git src/hg/makeDb/scripts/nmd/genePredNmdEsc src/hg/makeDb/scripts/nmd/genePredNmdEsc index e599b86bcfd..baf08c7b87e 100755 --- src/hg/makeDb/scripts/nmd/genePredNmdEsc +++ src/hg/makeDb/scripts/nmd/genePredNmdEsc @@ -26,59 +26,67 @@ def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] inFname outDecoFname outCollapsedFname - " "Output BEDs with NMD escape regions. First output is decorator format, " "second is collapsed bigGenePred with gene symbols and transcript lists.") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-f", "--format", dest="format", action="store", default="genePredExt", help="Input format: 'genePredExt' (with bin column, e.g. ncbiRefSeq) or " "'bigGenePred' (e.g. gencode .bb file, will use bigBedToBed). Default: genePredExt") parser.add_option("--gene-sym-field", dest="geneSymField", type="int", default=17, help="bigGenePred 0-based field index that holds the gene symbol. " "Default 17 (Gencode 'geneName'). MANE puts the HGNC symbol in field 18.") + parser.add_option("--ncbi-id-field", dest="ncbiIdField", type="int", default=-1, + help="bigGenePred 0-based field index that holds the NCBI RefSeq accession " + "(NM_/NR_). Default -1 (not extracted). For MANE, pass 21.") + parser.add_option("--no-collapse", dest="noCollapse", action="store_true", default=False, + help="Emit one row per (transcript, region) instead of collapsing identical " + "coordinates across transcripts. Useful for sets like MANE where each " + "label-field column should be a single value.") parser.add_option("--rule1-mode", dest="rule1Mode", action="store", default="cds", choices=["cds", "mrna"], help="How to count the 50 bp walk-back from the last splice junction. " "'cds' (default): count only CDS nucleotides, skipping 3'UTR; " "paints up to 50 bp of CDS regardless of how far the last junction " "sits past the stop codon. " "'mrna': count mRNA nucleotides including 3'UTR, then clip output " "to CDS; when the last junction is more than 50 mRNA-bp past the " "stop codon, no CDS position is painted (tracks the 55 bp rule " "literature more closely).") (options, args) = parser.parse_args() if len(args) != 3: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG) else: logging.basicConfig(level=logging.INFO) logging.getLogger().setLevel(logging.INFO) return args, options -def openInput(fname, fmt, geneSymField=17): +def openInput(fname, fmt, geneSymField=17, ncbiIdField=-1): """Open input file and yield parsed transcript dicts. Both formats yield dicts with keys: name, chrom, strand, txStart, txEnd, - cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, geneSym, cdsStartStat, cdsEndStat, exonFrames + cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, geneSym, ncbiId, + cdsStartStat, cdsEndStat, exonFrames """ if fmt == "bigGenePred": # pipe through bigBedToBed proc = subprocess.Popen(["bigBedToBed", fname, "stdout"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) fh = proc.stdout elif fname.endswith(".gz"): fh = gzip.open(fname, "rt") else: fh = open(fname) for line in fh: line = line.rstrip("\n\r") if not line: continue @@ -88,54 +96,56 @@ # genePredExt with bin column: # bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, # exonCount, exonStarts, exonEnds, score, name2, cdsStartStat, cdsEndStat, exonFrames rec = { "name": fields[1], "chrom": fields[2], "strand": fields[3], "txStart": int(fields[4]), "txEnd": int(fields[5]), "cdsStart": int(fields[6]), "cdsEnd": int(fields[7]), "exonCount": int(fields[8]), "exonStarts": [int(x) for x in fields[9].strip(",").split(",") if x], "exonEnds": [int(x) for x in fields[10].strip(",").split(",") if x], "geneSym": fields[12] if len(fields) > 12 else "", + "ncbiId": "", "cdsStartStat": fields[13] if len(fields) > 13 else "none", "cdsEndStat": fields[14] if len(fields) > 14 else "none", "exonFrames": fields[15].strip(",") if len(fields) > 15 else "", } elif fmt == "bigGenePred": # bigGenePred: # chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, # color, blockCount, blockSizes, chromStarts, name2, cdsStartStat, cdsEndStat, # exonFrames, type, geneName, geneName2, geneType chromStart = int(fields[1]) blockSizes = [int(x) for x in fields[10].strip(",").split(",") if x] blockStarts = [int(x) for x in fields[11].strip(",").split(",") if x] rec = { "name": fields[3], "chrom": fields[0], "strand": fields[5], "txStart": chromStart, "txEnd": int(fields[2]), "cdsStart": int(fields[6]), "cdsEnd": int(fields[7]), "exonCount": int(fields[9]), "exonStarts": [chromStart + s for s in blockStarts], "exonEnds": [chromStart + s + sz for s, sz in zip(blockStarts, blockSizes)], "geneSym": fields[geneSymField] if len(fields) > geneSymField else "", + "ncbiId": fields[ncbiIdField] if (ncbiIdField >= 0 and len(fields) > ncbiIdField) else "", "cdsStartStat": fields[13] if len(fields) > 13 else "none", "cdsEndStat": fields[14] if len(fields) > 14 else "none", "exonFrames": fields[15].strip(",") if len(fields) > 15 else "", } else: raise ValueError("Unknown format: " + fmt) yield rec if fmt == "bigGenePred": proc.wait() def bedOut(row, txStart, txEnd, ofh, rule): "write a decorator BED line" row = [str(x) for x in row] @@ -269,34 +279,35 @@ bedOut(bed, txStart, txEnd, ofh, rule) regions.append( (chrom, start, end) ) if doStop: return regions return regions # ----------- main -------------- def main(): args, options = parseArgs() inFname, outDecoFname, outCollapsedFname = args decoOfh = open(outDecoFname, "w") # collect regions for the collapsed output: - # key = (chrom, start, end, rule) -> {"transcripts": [...], "strand": set(), "geneSym": str} - regionData = defaultdict(lambda: {"transcripts": [], "strands": set(), "geneSym": ""}) + # key = (chrom, start, end, rule) -> {"transcripts": [...], "ncbiIds": [...], + # "strand": set(), "geneSym": str} + regionData = defaultdict(lambda: {"transcripts": [], "ncbiIds": [], "strands": set(), "geneSym": ""}) - for rec in openInput(inFname, options.format, options.geneSymField): + for rec in openInput(inFname, options.format, options.geneSymField, options.ncbiIdField): name = rec["name"] chrom = rec["chrom"] strand = rec["strand"] txStart = rec["txStart"] txEnd = rec["txEnd"] cdsStart = rec["cdsStart"] cdsEnd = rec["cdsEnd"] # skip non-coding transcripts (cdsStart == cdsEnd) if cdsStart >= cdsEnd: continue # gene symbol from record, fall back to transcript name geneSym = rec["geneSym"] @@ -306,34 +317,41 @@ # 5' UTR if cdsStart > exEnd: continue # 3' UTR if exStart > cdsEnd: continue if (exStart <= cdsStart and cdsStart <= exEnd): exStart = cdsStart if (exStart <= cdsEnd and cdsEnd <= exEnd): exEnd = cdsEnd # skip degenerate cdsExons (CDS boundary lands exactly on exon boundary) if exStart >= exEnd: continue cdsExons.append( (exStart, exEnd) ) + ncbiId = rec.get("ncbiId", "") + def addRegions(regions, rule): for r in regions: + if options.noCollapse: + key = (r[0], r[1], r[2], rule, name) + else: key = (r[0], r[1], r[2], rule) regionData[key]["transcripts"].append(name) + if ncbiId: + regionData[key]["ncbiIds"].append(ncbiId) regionData[key]["strands"].add(strand) if geneSym and not regionData[key]["geneSym"]: regionData[key]["geneSym"] = geneSym # rule 2 applies when no EJC can be deposited downstream of the stop codon. # that requires: single CDS exon (no CDS-CDS junction) AND no 3'UTR intron. # 5'UTR introns are allowed - their EJCs are upstream of the stop and either # are cleared by the scanning 40S or are never encountered by the terminating # ribosome, so they do not trigger NMD. if strand == "+": hasThreeUtrIntron = any(s > cdsEnd for s in rec["exonStarts"]) else: hasThreeUtrIntron = any(e < cdsStart for e in rec["exonEnds"]) if len(cdsExons) == 1 and not hasThreeUtrIntron: @@ -366,44 +384,47 @@ if hasThreeUtrIntron: rule4Exons = cdsExons elif strand == "+": rule4Exons = cdsExons[:-1] else: rule4Exons = cdsExons[1:] for exStart, exEnd in rule4Exons: if exEnd - exStart > LONG_EXON_THRESHOLD: bed = [chrom, str(exStart), str(exEnd), name] bedOut(bed, txStart, txEnd, decoOfh, 4) addRegions([(chrom, exStart, exEnd)], 4) decoOfh.close() - # write collapsed output as bed 9 + mouseover + transcripts + # write collapsed output as bed 9 + mouseover + transcripts + ncbiIds collOfh = open(outCollapsedFname, "w") - for (chrom, start, end, rule), data in sorted(regionData.items()): + for key, data in sorted(regionData.items()): + chrom, start, end, rule = key[:4] txList = sorted(set(data["transcripts"])) + ncbiList = sorted(set(data["ncbiIds"])) geneSym = data["geneSym"] if not geneSym: geneSym = txList[0] # pick strand: use the strand if all transcripts agree, else "." strands = data["strands"] strand = list(strands)[0] if len(strands) == 1 else "." color = RULE_COLORS[rule] txListStr = ",".join(txList) + ncbiListStr = ",".join(ncbiList) if len(txList) <= 3: mouseover = RULE_DESCRIPTIONS[rule] + " (" + ", ".join(txList) + ")" else: mouseover = RULE_DESCRIPTIONS[rule] + " (" + str(len(txList)) + " transcripts)" row = [chrom, str(start), str(end), geneSym, "0", strand, - str(start), str(end), color, mouseover, txListStr] + str(start), str(end), color, mouseover, txListStr, ncbiListStr] collOfh.write("\t".join(row)) collOfh.write("\n") collOfh.close() logging.info("Wrote %d decorator regions to %s" % (sum(len(d["transcripts"]) for d in regionData.values()), outDecoFname)) logging.info("Wrote %d collapsed regions to %s" % (len(regionData), outCollapsedFname)) main()