f30798ae5d11e88e0ab7eb2bcab634e253fd0675 max Thu Apr 23 10:36:40 2026 -0700 Add gnomAD MPC v4.1.1 track to hg38. New composite track under the gnomAD container showing per-variant MPC (Missense deleteriousness Prediction by Constraint) scores from gnomAD v4.1.1. Four bigWigs provide per-base scores (one per ALT nucleotide); a companion bigBed carries the ~250K multi-transcript variants with a per-transcript breakdown. Included via 'alpha' for QA review. refs #37434 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> diff --git src/hg/makeDb/scripts/gnomadMpc/gnomadMpcToWig.py src/hg/makeDb/scripts/gnomadMpc/gnomadMpcToWig.py new file mode 100644 index 00000000000..c1992ddf8dd --- /dev/null +++ src/hg/makeDb/scripts/gnomadMpc/gnomadMpcToWig.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python3 +"""Convert gnomAD v4.1.1 MPC TSV into four wig files, one per ALT base. + +Input format (TSV, with header): + locus<TAB>alleles<TAB>transcript<TAB>mpc + chr1:65568<TAB>["A","C"]<TAB>ENST00000641515<TAB>8.8102e-01 + +Writes a.wig, c.wig, g.wig, t.wig in the current directory. At positions +where the variant ALT matches the file's nucleotide, the MPC score is +emitted; otherwise 0.0. If the same (chrom, pos, alt) appears in multiple +transcripts with different scores, the maximum is kept. +""" + +import sys +import gzip +import json + +def readRows(infile): + opener = gzip.open if infile.endswith((".gz", ".bgz")) else open + with opener(infile, "rt") as f: + header = f.readline() + if not header.startswith("locus"): + sys.exit("unexpected header: " + header) + for line in f: + if not line.strip(): + continue + fields = line.rstrip("\n").split("\t") + locus, alleles_str, transcript, mpc = fields + chrom, pos = locus.split(":") + pos = int(pos) + alleles = json.loads(alleles_str) + ref, alt = alleles[0], alleles[1] + if ref not in "ACGT" or alt not in "ACGT": + continue + yield chrom, pos, ref, alt, float(mpc) + +def chunks(infile): + """Yield (chrom, firstPos, [[A,C,G,T], ...]) for contiguous runs.""" + firstChrom = None + firstPos = None + lastChrom = None + lastPos = None + values = [] + + for chrom, pos, ref, alt, mpc in readRows(infile): + if lastChrom is None: + firstChrom, firstPos = chrom, pos + values = [[0.0, 0.0, 0.0, 0.0]] + elif chrom != lastChrom or pos - lastPos > 1: + yield firstChrom, firstPos, values + firstChrom, firstPos = chrom, pos + values = [[0.0, 0.0, 0.0, 0.0]] + elif pos != lastPos: + values.append([0.0, 0.0, 0.0, 0.0]) + # else same pos: multiple transcripts, fold into current slot + + idx = "ACGT".find(alt) + if mpc > values[-1][idx]: + values[-1][idx] = mpc + + lastChrom, lastPos = chrom, pos + + if values: + yield firstChrom, firstPos, values + +def main(): + if len(sys.argv) != 2: + sys.exit("usage: gnomadMpcToWig.py input.tsv.{gz,bgz}") + + infile = sys.argv[1] + outFhs = {base: open(base.lower() + ".wig", "w") for base in "ACGT"} + + for chrom, firstPos, nuclValues in chunks(infile): + if not nuclValues: + continue + for fh in outFhs.values(): + fh.write("fixedStep chrom=%s span=1 step=1 start=%d\n" % (chrom, firstPos)) + for vals in nuclValues: + for i, base in enumerate("ACGT"): + outFhs[base].write("%g\n" % vals[i]) + + for fh in outFhs.values(): + fh.close() + +if __name__ == "__main__": + main()