f30798ae5d11e88e0ab7eb2bcab634e253fd0675 max Thu Apr 23 10:36:40 2026 -0700 Add gnomAD MPC v4.1.1 track to hg38. New composite track under the gnomAD container showing per-variant MPC (Missense deleteriousness Prediction by Constraint) scores from gnomAD v4.1.1. Four bigWigs provide per-base scores (one per ALT nucleotide); a companion bigBed carries the ~250K multi-transcript variants with a per-transcript breakdown. Included via 'alpha' for QA review. refs #37434 Co-Authored-By: Claude Opus 4.7 (1M context) diff --git src/hg/makeDb/scripts/gnomadMpc/gnomadMpcToBed.py src/hg/makeDb/scripts/gnomadMpc/gnomadMpcToBed.py new file mode 100644 index 00000000000..79b5303f66e --- /dev/null +++ src/hg/makeDb/scripts/gnomadMpc/gnomadMpcToBed.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +"""Convert gnomAD v4.1.1 MPC TSV into a BED9+ file of multi-transcript variants. + +Input TSV (with header): + locusallelestranscriptmpc + chr1:65568["A","C"]ENST000006415158.8102e-01 + +Only variants scored against more than one transcript are emitted (single- +transcript variants are already fully represented by the companion bigWigs). +Per-transcript scores are folded into comma-separated lstring fields so the +track-detail page can show the full breakdown. The BED score column uses the +max MPC across transcripts, scaled to the 0-1000 range (MPC 6 -> ~1000). + +Input must already be sorted so that all rows for the same (chrom, pos) are +grouped together; alts/transcripts within a position may appear in any order. +Output is written unsorted-within-pos-but-emitted-in-input-order; pipe through +`sort -k1,1 -k2,2n` before bedToBigBed. +""" + +import sys +import gzip +import json + +def readRows(infile): + opener = gzip.open if infile.endswith((".gz", ".bgz")) else open + with opener(infile, "rt") as f: + header = f.readline() + if not header.startswith("locus"): + sys.exit("unexpected header: " + header) + for line in f: + if not line.strip(): + continue + locus, alleles_str, transcript, mpc = line.rstrip("\n").split("\t") + chrom, pos = locus.split(":") + alleles = json.loads(alleles_str) + ref, alt = alleles[0], alleles[1] + if ref not in "ACGT" or alt not in "ACGT": + continue + yield chrom, int(pos), ref, alt, transcript, float(mpc) + +def mpcColor(score): + """Map MPC score to an RGB string (gray -> yellow -> orange -> red).""" + if score < 1: + return "200,200,200" + if score < 2: + return "255,200,100" + if score < 3: + return "255,150,50" + return "200,0,0" + +def emit(out, chrom, pos, ref, alt_groups): + start = pos - 1 + end = pos + for alt in sorted(alt_groups): + items = alt_groups[alt] + if len(items) < 2: + continue + max_mpc = max(score for _, score in items) + transcripts = ",".join(tx for tx, _ in items) + scores = ",".join("%g" % score for _, score in items) + name = "%s>%s" % (ref, alt) + score_int = min(1000, int(max_mpc * 166)) + color = mpcColor(max_mpc) + out.write("\t".join([ + chrom, str(start), str(end), name, + str(score_int), ".", + str(start), str(end), color, + ref, alt, "%g" % max_mpc, + transcripts, scores, str(len(items)) + ]) + "\n") + +def main(): + if len(sys.argv) != 3: + sys.exit("usage: gnomadMpcToBed.py input.tsv.{gz,bgz} output.bed") + + infile, outfile = sys.argv[1:] + out = open(outfile, "w") + + cur_chrom = None + cur_pos = None + cur_ref = None + alt_groups = {} + + for chrom, pos, ref, alt, transcript, mpc in readRows(infile): + if (chrom, pos) != (cur_chrom, cur_pos): + if cur_chrom is not None: + emit(out, cur_chrom, cur_pos, cur_ref, alt_groups) + cur_chrom, cur_pos, cur_ref = chrom, pos, ref + alt_groups = {} + alt_groups.setdefault(alt, []).append((transcript, mpc)) + + if alt_groups: + emit(out, cur_chrom, cur_pos, cur_ref, alt_groups) + out.close() + +if __name__ == "__main__": + main()