f30798ae5d11e88e0ab7eb2bcab634e253fd0675
max
  Thu Apr 23 10:36:40 2026 -0700
Add gnomAD MPC v4.1.1 track to hg38.

New composite track under the gnomAD container showing per-variant
MPC (Missense deleteriousness Prediction by Constraint) scores from
gnomAD v4.1.1. Four bigWigs provide per-base scores (one per ALT
nucleotide); a companion bigBed carries the ~250K multi-transcript
variants with a per-transcript breakdown. Included via 'alpha' for
QA review. refs #37434

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/gnomadMpc/gnomadMpcToWig.py src/hg/makeDb/scripts/gnomadMpc/gnomadMpcToWig.py
new file mode 100644
index 00000000000..c1992ddf8dd
--- /dev/null
+++ src/hg/makeDb/scripts/gnomadMpc/gnomadMpcToWig.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+"""Convert gnomAD v4.1.1 MPC TSV into four wig files, one per ALT base.
+
+Input format (TSV, with header):
+    locus<TAB>alleles<TAB>transcript<TAB>mpc
+    chr1:65568<TAB>["A","C"]<TAB>ENST00000641515<TAB>8.8102e-01
+
+Writes a.wig, c.wig, g.wig, t.wig in the current directory. At positions
+where the variant ALT matches the file's nucleotide, the MPC score is
+emitted; otherwise 0.0. If the same (chrom, pos, alt) appears in multiple
+transcripts with different scores, the maximum is kept.
+"""
+
+import sys
+import gzip
+import json
+
+def readRows(infile):
+    opener = gzip.open if infile.endswith((".gz", ".bgz")) else open
+    with opener(infile, "rt") as f:
+        header = f.readline()
+        if not header.startswith("locus"):
+            sys.exit("unexpected header: " + header)
+        for line in f:
+            if not line.strip():
+                continue
+            fields = line.rstrip("\n").split("\t")
+            locus, alleles_str, transcript, mpc = fields
+            chrom, pos = locus.split(":")
+            pos = int(pos)
+            alleles = json.loads(alleles_str)
+            ref, alt = alleles[0], alleles[1]
+            if ref not in "ACGT" or alt not in "ACGT":
+                continue
+            yield chrom, pos, ref, alt, float(mpc)
+
+def chunks(infile):
+    """Yield (chrom, firstPos, [[A,C,G,T], ...]) for contiguous runs."""
+    firstChrom = None
+    firstPos = None
+    lastChrom = None
+    lastPos = None
+    values = []
+
+    for chrom, pos, ref, alt, mpc in readRows(infile):
+        if lastChrom is None:
+            firstChrom, firstPos = chrom, pos
+            values = [[0.0, 0.0, 0.0, 0.0]]
+        elif chrom != lastChrom or pos - lastPos > 1:
+            yield firstChrom, firstPos, values
+            firstChrom, firstPos = chrom, pos
+            values = [[0.0, 0.0, 0.0, 0.0]]
+        elif pos != lastPos:
+            values.append([0.0, 0.0, 0.0, 0.0])
+        # else same pos: multiple transcripts, fold into current slot
+
+        idx = "ACGT".find(alt)
+        if mpc > values[-1][idx]:
+            values[-1][idx] = mpc
+
+        lastChrom, lastPos = chrom, pos
+
+    if values:
+        yield firstChrom, firstPos, values
+
+def main():
+    if len(sys.argv) != 2:
+        sys.exit("usage: gnomadMpcToWig.py input.tsv.{gz,bgz}")
+
+    infile = sys.argv[1]
+    outFhs = {base: open(base.lower() + ".wig", "w") for base in "ACGT"}
+
+    for chrom, firstPos, nuclValues in chunks(infile):
+        if not nuclValues:
+            continue
+        for fh in outFhs.values():
+            fh.write("fixedStep chrom=%s span=1 step=1 start=%d\n" % (chrom, firstPos))
+        for vals in nuclValues:
+            for i, base in enumerate("ACGT"):
+                outFhs[base].write("%g\n" % vals[i])
+
+    for fh in outFhs.values():
+        fh.close()
+
+if __name__ == "__main__":
+    main()