bfa91627a224f0ca4a9e10718297df25f163847a max Tue Mar 24 04:09:48 2026 -0700 adding NMD escape supertrack, refs #33737 diff --git src/hg/makeDb/scripts/nmd/genePredNmdEsc src/hg/makeDb/scripts/nmd/genePredNmdEsc new file mode 100755 index 00000000000..b088b2729cf --- /dev/null +++ src/hg/makeDb/scripts/nmd/genePredNmdEsc @@ -0,0 +1,265 @@ +#!/usr/bin/env python + +import logging, sys, optparse, gzip, subprocess +from collections import defaultdict +from os.path import join, basename, dirname, isfile + +# ==== functions ===== + +# Colors by rule +RULE_COLORS = { + 1: "255,0,0", # red - last 50bp of last coding junction + 2: "255,140,0", # orange - intronless transcript + 3: "139,0,0", # dark red - first 100bp of coding nucleotides +} + +RULE_DESCRIPTIONS = { + 1: "Rule 1 - last 50bp of last coding exon junction", + 2: "Rule 2 - intronless transcript", + 3: "Rule 3 - first 100bp of coding nucleotides", +} + +def parseArgs(): + " setup logging, parse command line arguments and options. -h shows auto-generated help page " + parser = optparse.OptionParser("usage: %prog [options] inFname outDecoFname outCollapsedFname - " + "Output BEDs with NMD escape regions. First output is decorator format, " + "second is collapsed bigGenePred with gene symbols and transcript lists.") + + parser.add_option("-d", "--debug", dest="debug", action="store_true", + help="show debug messages") + parser.add_option("-f", "--format", dest="format", action="store", default="genePredExt", + help="Input format: 'genePredExt' (with bin column, e.g. ncbiRefSeq) or " + "'bigGenePred' (e.g. gencode .bb file, will use bigBedToBed). Default: genePredExt") + (options, args) = parser.parse_args() + + if len(args) != 3: + parser.print_help() + exit(1) + + if options.debug: + logging.basicConfig(level=logging.DEBUG) + logging.getLogger().setLevel(logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + logging.getLogger().setLevel(logging.INFO) + + return args, options + +def openInput(fname, fmt): + """Open input file and yield parsed transcript dicts. + Both formats yield dicts with keys: name, chrom, strand, txStart, txEnd, + cdsStart, cdsEnd, exonCount, exonStarts, exonEnds, geneSym, cdsStartStat, cdsEndStat, exonFrames + """ + if fmt == "bigGenePred": + # pipe through bigBedToBed + proc = subprocess.Popen(["bigBedToBed", fname, "stdout"], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + fh = proc.stdout + elif fname.endswith(".gz"): + fh = gzip.open(fname, "rt") + else: + fh = open(fname) + + for line in fh: + line = line.rstrip("\n\r") + if not line: + continue + fields = line.split("\t") + + if fmt == "genePredExt": + # genePredExt with bin column: + # bin, name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, + # exonCount, exonStarts, exonEnds, score, name2, cdsStartStat, cdsEndStat, exonFrames + rec = { + "name": fields[1], + "chrom": fields[2], + "strand": fields[3], + "txStart": int(fields[4]), + "txEnd": int(fields[5]), + "cdsStart": int(fields[6]), + "cdsEnd": int(fields[7]), + "exonCount": int(fields[8]), + "exonStarts": [int(x) for x in fields[9].strip(",").split(",") if x], + "exonEnds": [int(x) for x in fields[10].strip(",").split(",") if x], + "geneSym": fields[12] if len(fields) > 12 else "", + "cdsStartStat": fields[13] if len(fields) > 13 else "none", + "cdsEndStat": fields[14] if len(fields) > 14 else "none", + "exonFrames": fields[15].strip(",") if len(fields) > 15 else "", + } + elif fmt == "bigGenePred": + # bigGenePred: + # chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, + # color, blockCount, blockSizes, chromStarts, name2, cdsStartStat, cdsEndStat, + # exonFrames, type, geneName, geneName2, geneType + chromStart = int(fields[1]) + blockSizes = [int(x) for x in fields[10].strip(",").split(",") if x] + blockStarts = [int(x) for x in fields[11].strip(",").split(",") if x] + rec = { + "name": fields[3], + "chrom": fields[0], + "strand": fields[5], + "txStart": chromStart, + "txEnd": int(fields[2]), + "cdsStart": int(fields[6]), + "cdsEnd": int(fields[7]), + "exonCount": int(fields[9]), + "exonStarts": [chromStart + s for s in blockStarts], + "exonEnds": [chromStart + s + sz for s, sz in zip(blockStarts, blockSizes)], + "geneSym": fields[17] if len(fields) > 17 else "", + "cdsStartStat": fields[13] if len(fields) > 13 else "none", + "cdsEndStat": fields[14] if len(fields) > 14 else "none", + "exonFrames": fields[15].strip(",") if len(fields) > 15 else "", + } + else: + raise ValueError("Unknown format: " + fmt) + + yield rec + + if fmt == "bigGenePred": + proc.wait() + +def bedOut(row, txStart, txEnd, ofh, rule): + "write a decorator BED line" + row = [str(x) for x in row] + chrom, start, end, name = row + decItem = chrom+":"+str(txStart)+"-"+str(txEnd)+":"+name + color = RULE_COLORS[rule] + mouseover = RULE_DESCRIPTIONS[rule] + row = [chrom, start, end, name, "0", ".", start, end, color, "1", + str(int(end)-int(start)), "0", decItem, "block", color, "", mouseover] + ofh.write("\t".join(row)) + ofh.write("\n") + + +def outputExonsUpTo(from3Prime, cdsExons, chrom, txStart, txEnd, name, n, ofh, rule): + """ given a list of (start, end), output start-end BEDs to ofh that cover n nucleotides. + Returns list of (chrom, start, end) regions output. """ + doneNs = 0 + doStop = False + regions = [] + if from3Prime: + cdsExons = list(reversed(cdsExons)) + + # -50 means "-50 from the last junction" so take length of last exon + 50 + if n < 0: + ex1Start = cdsExons[0][0] + ex1End = cdsExons[0][1] + n = (ex1End-ex1Start)+50 + + for start, end in cdsExons: + exLen = end-start + missBps = n-doneNs + if doneNs+exLen > n: + if from3Prime: + start = end-missBps + else: + end = start+missBps + doStop = True + bed = [chrom, str(start), str(end), name] + doneNs += exLen + bedOut(bed, txStart, txEnd, ofh, rule) + regions.append( (chrom, start, end) ) + if doStop: + return regions + return regions + +# ----------- main -------------- +def main(): + args, options = parseArgs() + + inFname, outDecoFname, outCollapsedFname = args + + decoOfh = open(outDecoFname, "w") + + # collect regions for the collapsed output: + # key = (chrom, start, end, rule) -> {"transcripts": [...], "strand": set(), "geneSym": str} + regionData = defaultdict(lambda: {"transcripts": [], "strands": set(), "geneSym": ""}) + + for rec in openInput(inFname, options.format): + name = rec["name"] + chrom = rec["chrom"] + strand = rec["strand"] + txStart = rec["txStart"] + txEnd = rec["txEnd"] + cdsStart = rec["cdsStart"] + cdsEnd = rec["cdsEnd"] + + # skip non-coding transcripts (cdsStart == cdsEnd) + if cdsStart >= cdsEnd: + continue + + # gene symbol from record, fall back to transcript name + geneSym = rec["geneSym"] + + # only keep exons that have CDS and cut around CDS + cdsExons = [] + for exStart, exEnd in zip(rec["exonStarts"], rec["exonEnds"]): + # 5' UTR + if cdsStart > exEnd: + continue + # 3' UTR + if exStart > cdsEnd: + continue + if (exStart <= cdsStart and cdsStart <= exEnd): + exStart = cdsStart + if (exStart <= cdsEnd and cdsEnd <= exEnd): + exEnd = cdsEnd + cdsExons.append( (exStart, exEnd) ) + + def addRegions(regions, rule): + for r in regions: + key = (r[0], r[1], r[2], rule) + regionData[key]["transcripts"].append(name) + regionData[key]["strands"].add(strand) + if geneSym and not regionData[key]["geneSym"]: + regionData[key]["geneSym"] = geneSym + + if len(cdsExons)==1: + # rule 2: intronless transcript + bed = [chrom, str(cdsStart), str(cdsEnd), name] + bedOut(bed, txStart, txEnd, decoOfh, 2) + addRegions([(chrom, cdsStart, cdsEnd)], 2) + else: + if strand=="+": + # rule 3: first 100bp of coding nucleotides + regions = outputExonsUpTo(False, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3) + addRegions(regions, 3) + # rule 1: last 50bp of last coding junction + regions = outputExonsUpTo(True, cdsExons, chrom, txStart, txEnd, name, -50, decoOfh, 1) + addRegions(regions, 1) + else: + # rule 3: first 100bp of coding nucleotides + regions = outputExonsUpTo(True, cdsExons, chrom, txStart, txEnd, name, 100, decoOfh, 3) + addRegions(regions, 3) + # rule 1: last 50bp of last coding junction + regions = outputExonsUpTo(False, cdsExons, chrom, txStart, txEnd, name, -50, decoOfh, 1) + addRegions(regions, 1) + + decoOfh.close() + + # write collapsed output as bed 9 + mouseover + transcripts + collOfh = open(outCollapsedFname, "w") + for (chrom, start, end, rule), data in sorted(regionData.items()): + txList = sorted(set(data["transcripts"])) + geneSym = data["geneSym"] + if not geneSym: + geneSym = txList[0] + + # pick strand: use the strand if all transcripts agree, else "." + strands = data["strands"] + strand = list(strands)[0] if len(strands) == 1 else "." + + color = RULE_COLORS[rule] + txListStr = ",".join(txList) + mouseover = RULE_DESCRIPTIONS[rule] + " (" + str(len(txList)) + " transcripts)" + + row = [chrom, str(start), str(end), geneSym, "0", strand, + str(start), str(end), color, mouseover, txListStr] + collOfh.write("\t".join(row)) + collOfh.write("\n") + + collOfh.close() + logging.info("Wrote %d decorator regions to %s" % (sum(len(d["transcripts"]) for d in regionData.values()), outDecoFname)) + logging.info("Wrote %d collapsed regions to %s" % (len(regionData), outCollapsedFname)) + +main()