4bd316f5f1ca47328bd3f9a181214b788055f0bc lrnassar Tue Apr 21 13:29:26 2026 -0700 NMD Escape QA round 3: switch RefSeq to curated, fix Rule 2 misclassification. refs #33737 Switched the NMD Escape RefSeq subtrack input from hg38.ncbiRefSeq.txt.gz (all) to hg38.ncbiRefSeqCurated.txt.gz (NM_/NR_ only, no XM_/XR_ predicted models) per Max's feedback. longLabel updated to "NCBI RefSeq Curated transcripts". Fixed Rule 2 in genePredNmdEsc to test rec["exonCount"]==1 instead of len(cdsExons)==1. The old test misclassified multi-exon transcripts with a single CDS exon (UTR introns) as "intronless" and silently suppressed their Rule 1/3/4 assignments via the if/else short-circuit. 3,253 RefSeq curated and ~2,000 Gencode transcripts reassigned from Rule 2 to Rules 1/3. Rebuilt both tracks. Added Rule 1 caveat to nmdEscTranscripts.html for transcripts with a penultimate coding exon shorter than 50 bp. Added reciprocal relatedTracks.ra entries for nmd <-> mane and nmd <-> ncbiRefSeq. QA cleanups: non-ASCII prime char replaced with ′, mailing list links given target="_blank" across all three HTML pages, dead commented nmdGencode block removed from nmd.ra, AutoSQL field comments updated to cover Rule 4 color and the gene-symbol-to-transcript-ID fallback. Makedoc updated with the full Gencode + RefSeq pipeline and /gbdb symlinks. diff --git src/hg/makeDb/scripts/nmd/genePredNmdEsc src/hg/makeDb/scripts/nmd/genePredNmdEsc index 12faa973367..d8b3e005c4d 100755 --- src/hg/makeDb/scripts/nmd/genePredNmdEsc +++ src/hg/makeDb/scripts/nmd/genePredNmdEsc @@ -1,286 +1,286 @@ #!/usr/bin/env python import logging, sys, optparse, gzip, subprocess from collections import defaultdict from os.path import join, basename, dirname, isfile # ==== functions ===== # Colors by rule RULE_COLORS = { 1: "255,0,0", # red - last 50bp of last coding junction 2: "255,140,0", # orange - intronless transcript 3: "139,0,0", # dark red - first 100bp of coding nucleotides 4: "255,215,0", # gold - long coding exon (>400 nt) } RULE_DESCRIPTIONS = { 1: "Rule 1 - last 50bp of last coding exon junction", 2: "Rule 2 - intronless transcript", 3: "Rule 3 - first 100bp of coding nucleotides", 4: "Rule 4 - PTC in long exon (>400 nt)", } # Lindeboom et al. 2016: reduced NMD efficiency for PTCs in exons longer than this LONG_EXON_THRESHOLD = 400 def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] inFname outDecoFname outCollapsedFname - " "Output BEDs with NMD escape regions. First output is decorator format, " "second is collapsed bigGenePred with gene symbols and transcript lists.") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-f", "--format", dest="format", action="store", default="genePredExt", help="Input format: 'genePredExt' (with bin column, e.g. ncbiRefSeq) or " "'bigGenePred' (e.g. gencode .bb file, will use bigBedToBed). Default: genePredExt") (options, args) = parser.parse_args() if len(args) != 3: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG) else: logging.basicConfig(level=logging.INFO) logging.getLogger().setLevel(logging.INFO) return args, options def openInput(fname, fmt): """Open input file and yield parsed transcript dicts. Both formats yield dicts with keys: name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, geneSym, cdsStartStat, cdsEndStat, exonFrames """ if fmt == "bigGenePred": # pipe through bigBedToBed proc = subprocess.Popen(["bigBedToBed", fname, "stdout"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) fh = proc.stdout elif fname.endswith(".gz"): fh = gzip.open(fname, "rt") else: fh = open(fname) for line in fh: line = line.rstrip("\n\r") if not line: continue fields = line.split("\t") if fmt == "genePredExt": # genePredExt with bin column: # bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, # exonCount, exonStarts, exonEnds, score, name2, cdsStartStat, cdsEndStat, exonFrames rec = { "name": fields[1], "chrom": fields[2], "strand": fields[3], "txStart": int(fields[4]), "txEnd": int(fields[5]), "cdsStart": int(fields[6]), "cdsEnd": int(fields[7]), "exonCount": int(fields[8]), "exonStarts": [int(x) for x in fields[9].strip(",").split(",") if x], "exonEnds": [int(x) for x in fields[10].strip(",").split(",") if x], "geneSym": fields[12] if len(fields) > 12 else "", "cdsStartStat": fields[13] if len(fields) > 13 else "none", "cdsEndStat": fields[14] if len(fields) > 14 else "none", "exonFrames": fields[15].strip(",") if len(fields) > 15 else "", } elif fmt == "bigGenePred": # bigGenePred: # chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, # color, blockCount, blockSizes, chromStarts, name2, cdsStartStat, cdsEndStat, # exonFrames, type, geneName, geneName2, geneType chromStart = int(fields[1]) blockSizes = [int(x) for x in fields[10].strip(",").split(",") if x] blockStarts = [int(x) for x in fields[11].strip(",").split(",") if x] rec = { "name": fields[3], "chrom": fields[0], "strand": fields[5], "txStart": chromStart, "txEnd": int(fields[2]), "cdsStart": int(fields[6]), "cdsEnd": int(fields[7]), "exonCount": int(fields[9]), "exonStarts": [chromStart + s for s in blockStarts], "exonEnds": [chromStart + s + sz for s, sz in zip(blockStarts, blockSizes)], "geneSym": fields[17] if len(fields) > 17 else "", "cdsStartStat": fields[13] if len(fields) > 13 else "none", "cdsEndStat": fields[14] if len(fields) > 14 else "none", "exonFrames": fields[15].strip(",") if len(fields) > 15 else "", } else: raise ValueError("Unknown format: " + fmt) yield rec if fmt == "bigGenePred": proc.wait() def bedOut(row, txStart, txEnd, ofh, rule): "write a decorator BED line" row = [str(x) for x in row] chrom, start, end, name = row decItem = chrom+":"+str(txStart)+"-"+str(txEnd)+":"+name color = RULE_COLORS[rule] mouseover = RULE_DESCRIPTIONS[rule] row = [chrom, start, end, name, "0", ".", start, end, color, "1", str(int(end)-int(start)), "0", decItem, "block", color, "", mouseover] ofh.write("\t".join(row)) ofh.write("\n") def outputExonsUpTo(from3Prime, cdsExons, chrom, txStart, txEnd, name, n, ofh, rule): """ given a list of (start, end), output start-end BEDs to ofh that cover n nucleotides. Returns list of (chrom, start, end) regions output. """ doneNs = 0 doStop = False regions = [] if from3Prime: cdsExons = list(reversed(cdsExons)) # -50 means "-50 from the last junction" so take length of last exon + 50 if n < 0: ex1Start = cdsExons[0][0] ex1End = cdsExons[0][1] n = (ex1End-ex1Start)+50 for start, end in cdsExons: if doneNs >= n: return regions exLen = end-start missBps = n-doneNs if doneNs+exLen > n: if from3Prime: start = end-missBps else: end = start+missBps doStop = True bed = [chrom, str(start), str(end), name] doneNs += exLen bedOut(bed, txStart, txEnd, ofh, rule) regions.append( (chrom, start, end) ) if doStop: return regions return regions # ----------- main -------------- def main(): args, options = parseArgs() inFname, outDecoFname, outCollapsedFname = args decoOfh = open(outDecoFname, "w") # collect regions for the collapsed output: # key = (chrom, start, end, rule) -> {"transcripts": [...], "strand": set(), "geneSym": str} regionData = defaultdict(lambda: {"transcripts": [], "strands": set(), "geneSym": ""}) for rec in openInput(inFname, options.format): name = rec["name"] chrom = rec["chrom"] strand = rec["strand"] txStart = rec["txStart"] txEnd = rec["txEnd"] cdsStart = rec["cdsStart"] cdsEnd = rec["cdsEnd"] # skip non-coding transcripts (cdsStart == cdsEnd) if cdsStart >= cdsEnd: continue # gene symbol from record, fall back to transcript name geneSym = rec["geneSym"] # only keep exons that have CDS and cut around CDS cdsExons = [] for exStart, exEnd in zip(rec["exonStarts"], rec["exonEnds"]): # 5' UTR if cdsStart > exEnd: continue # 3' UTR if exStart > cdsEnd: continue if (exStart <= cdsStart and cdsStart <= exEnd): exStart = cdsStart if (exStart <= cdsEnd and cdsEnd <= exEnd): exEnd = cdsEnd # skip degenerate cdsExons (CDS boundary lands exactly on exon boundary) if exStart >= exEnd: continue cdsExons.append( (exStart, exEnd) ) def addRegions(regions, rule): for r in regions: key = (r[0], r[1], r[2], rule) regionData[key]["transcripts"].append(name) regionData[key]["strands"].add(strand) if geneSym and not regionData[key]["geneSym"]: regionData[key]["geneSym"] = geneSym - if len(cdsExons)==1: - # rule 2: intronless transcript + if rec["exonCount"] == 1: + # rule 2: truly intronless transcript (no splicing, no EJCs deposited) bed = [chrom, str(cdsStart), str(cdsEnd), name] bedOut(bed, txStart, txEnd, decoOfh, 2) addRegions([(chrom, cdsStart, cdsEnd)], 2) else: if strand=="+": # rule 3: first 100bp of coding nucleotides regions = outputExonsUpTo(False, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3) addRegions(regions, 3) # rule 1: last 50bp of last coding junction regions = outputExonsUpTo(True, cdsExons, chrom, txStart, txEnd, name, -50, decoOfh, 1) addRegions(regions, 1) # rule 4: coding exons >400 nt (excluding last; last is already covered by rule 1) rule4Exons = cdsExons[:-1] else: # rule 3: first 100bp of coding nucleotides regions = outputExonsUpTo(True, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3) addRegions(regions, 3) # rule 1: last 50bp of last coding junction regions = outputExonsUpTo(False, cdsExons, chrom, txStart, txEnd, name, -50, decoOfh, 1) addRegions(regions, 1) # rule 4: coding exons >400 nt (excluding last, which on - strand is cdsExons[0]) rule4Exons = cdsExons[1:] # rule 4: Lindeboom et al. 2016 long-exon rule for exStart, exEnd in rule4Exons: if exEnd - exStart > LONG_EXON_THRESHOLD: bed = [chrom, str(exStart), str(exEnd), name] bedOut(bed, txStart, txEnd, decoOfh, 4) addRegions([(chrom, exStart, exEnd)], 4) decoOfh.close() # write collapsed output as bed 9 + mouseover + transcripts collOfh = open(outCollapsedFname, "w") for (chrom, start, end, rule), data in sorted(regionData.items()): txList = sorted(set(data["transcripts"])) geneSym = data["geneSym"] if not geneSym: geneSym = txList[0] # pick strand: use the strand if all transcripts agree, else "." strands = data["strands"] strand = list(strands)[0] if len(strands) == 1 else "." color = RULE_COLORS[rule] txListStr = ",".join(txList) mouseover = RULE_DESCRIPTIONS[rule] + " (" + str(len(txList)) + " transcripts)" row = [chrom, str(start), str(end), geneSym, "0", strand, str(start), str(end), color, mouseover, txListStr] collOfh.write("\t".join(row)) collOfh.write("\n") collOfh.close() logging.info("Wrote %d decorator regions to %s" % (sum(len(d["transcripts"]) for d in regionData.values()), outDecoFname)) logging.info("Wrote %d collapsed regions to %s" % (len(regionData), outCollapsedFname)) main()