0c1e751423b38dd741875d4cdcc6ffb5d4c4a135
max
  Tue May 12 07:51:34 2026 -0700
mei: add DeepMEI 1000G subtrack on hg38

91,617 MEIs (68,282 Alu, 16,891 L1, 6,444 SVA) called by DeepMEI
on the 3,202 high-coverage 1000 Genomes samples. Same 1-bp anchor
convention and Okabe-Ito colors as meiHgsvc3. DeepMEI's symbolic
ALT carries no inserted sequence or insertion length, so the
bigBed schema is a subset of meiHgsvc3 (no svLen, callerCount,
validation flags, insertSeq). Also fixes the INS-svLen:carrierCount
label format note in meiHgsvc3.html. refs #37524

diff --git src/hg/makeDb/scripts/mei/meiDeepmei1kgVcfToBed.py src/hg/makeDb/scripts/mei/meiDeepmei1kgVcfToBed.py
new file mode 100755
index 00000000000..9e9aa0c97e3
--- /dev/null
+++ src/hg/makeDb/scripts/mei/meiDeepmei1kgVcfToBed.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+"""Convert DeepMEI 1000 Genomes high-coverage MEI VCF to a bed9+ file.
+
+Input: merge_1000g.latested.vcf.gz from the DeepMEI GitHub release
+(https://github.com/xuxif/DeepMEI/raw/refs/heads/main/DeepMEI/1000g_high_callset/),
+which contains 91,617 polymorphic Mobile Element Insertions identified
+in 3,202 high-coverage 1000 Genomes samples (NYGC) by DeepMEI. Each
+record uses a symbolic ALT (<INS:ME:ALU>, <INS:ME:LINE1>, <INS:ME:SVA>);
+no inserted sequence or insertion length is reported. INFO supplies
+SVTYPE, AF, AN and AC. FORMAT is GT:BPS:BPE:TL:CL:CR:ME per sample.
+
+Insertions are drawn as a 1bp anchor block, matching the lrSv / HGSVC3
+MEI tracks:
+  chromStart = POS - 1   (1-based VCF anchor -> 0-based)
+  chromEnd   = chromStart + 1
+"""
+
+import gzip
+import sys
+
+
+COLOR_BY_CLASS = {
+    "Alu": "0,114,178",     # blue (Okabe-Ito)
+    "L1":  "213,94,0",      # vermillion (Okabe-Ito)
+    "SVA": "0,158,115",     # bluish green (Okabe-Ito)
+}
+
+ALT_TO_CLASS = {
+    "<INS:ME:ALU>":   "Alu",
+    "<INS:ME:LINE1>": "L1",
+    "<INS:ME:SVA>":   "SVA",
+}
+
+
+def parseInfo(infoField):
+    fields = {}
+    for tok in infoField.split(";"):
+        if "=" in tok:
+            k, v = tok.split("=", 1)
+            fields[k] = v
+        else:
+            fields[tok] = True
+    return fields
+
+
+def main():
+    if len(sys.argv) != 4:
+        sys.exit("usage: meiDeepmei1kgVcfToBed.py input.vcf.gz chrom.sizes output.bed")
+
+    inFn, sizesFn, outFn = sys.argv[1:]
+    chromSizes = {}
+    with open(sizesFn) as f:
+        for line in f:
+            chrom, size = line.rstrip("\n").split("\t")
+            chromSizes[chrom] = int(size)
+
+    nIn = 0
+    nOut = 0
+    nSkippedChrom = 0
+    nSkippedBoundary = 0
+    nSkippedAlt = 0
+
+    opener = gzip.open if inFn.endswith(".gz") else open
+    out = open(outFn, "w")
+
+    sampleNames = None
+    with opener(inFn, "rt") as f:
+        for line in f:
+            if line.startswith("##"):
+                continue
+            if line.startswith("#CHROM"):
+                header = line.rstrip("\n").split("\t")
+                sampleNames = header[9:]
+                continue
+
+            nIn += 1
+            row = line.rstrip("\n").split("\t")
+            chrom = row[0]
+            pos = int(row[1])
+            alt = row[4]
+            info = parseInfo(row[7])
+            sampleGts = row[9:]
+
+            cls = ALT_TO_CLASS.get(alt)
+            if cls is None:
+                nSkippedAlt += 1
+                sys.stderr.write(f"WARN: skipping unknown ALT {alt} at {chrom}:{pos}\n")
+                continue
+
+            chromStart = pos - 1
+            chromEnd = chromStart + 1
+
+            if chrom not in chromSizes:
+                nSkippedChrom += 1
+                sys.stderr.write(f"WARN: skipping {chrom}:{pos}, chrom not in chrom.sizes\n")
+                continue
+            if chromEnd > chromSizes[chrom]:
+                nSkippedBoundary += 1
+                sys.stderr.write(
+                    f"WARN: skipping {chrom}:{pos}, end {chromEnd} > chromSize {chromSizes[chrom]}\n")
+                continue
+
+            altAC = 0
+            AN = 0
+            carrierCount = 0
+            sampleCount = 0
+            carrierSamples = []
+            for sampleName, gtField in zip(sampleNames, sampleGts):
+                gt = gtField.split(":", 1)[0]
+                hapAlt = 0
+                hapCalled = 0
+                for h in gt.replace("|", "/").split("/"):
+                    if h == ".":
+                        continue
+                    AN += 1
+                    hapCalled += 1
+                    if h == "1":
+                        altAC += 1
+                        hapAlt += 1
+                if hapCalled > 0:
+                    sampleCount += 1
+                if hapAlt > 0:
+                    carrierCount += 1
+                    carrierSamples.append(sampleName)
+
+            altAF = (altAC / AN) if AN > 0 else 0.0
+            score = max(0, min(1000, int(round(altAF * 1000))))
+            color = COLOR_BY_CLASS[cls]
+
+            name = f"INS-{cls}-{carrierCount}"
+
+            out.write("\t".join([
+                chrom,
+                str(chromStart),
+                str(chromEnd),
+                name,
+                str(score),
+                ".",
+                str(chromStart),
+                str(chromEnd),
+                color,
+                cls,
+                str(altAC),
+                str(AN),
+                f"{altAF:.6f}",
+                str(carrierCount),
+                str(sampleCount),
+                ",".join(carrierSamples),
+            ]) + "\n")
+            nOut += 1
+
+    out.close()
+    sys.stderr.write(
+        f"Read {nIn} records, wrote {nOut}, "
+        f"skipped {nSkippedAlt} (unknown ALT) + {nSkippedChrom} (chrom missing) + "
+        f"{nSkippedBoundary} (off chrom end)\n")
+
+
+if __name__ == "__main__":
+    main()