526213b2893134217a300ff913e11b4e98d67991
max
  Mon Apr 20 08:50:10 2026 -0700
lrSv: add cpc1Sv and aprSv pangenome SV subtracks (hg38, hs1)

cpc1Sv: 97,205 SVs from the CPC + HPRC Phase 1 pangenome (Gao et al 2023,
Nature; PMID 37316654) built on T2T-CHM13v2, with 53 Chinese and 47 HPRC
samples. Each graph snarl site is collapsed into one item, with its alt
alleles classified by length delta (INS/DEL/CPX, 50 bp threshold).

aprSv: 103,077 SVs from the Arabic Pangenome Reference (Nassir et al.
2025, Nat Commun; PMID 40707445) built on T2T-CHM13v2 from 53 UAE-resident
Arab individuals. Same multi-allele classification as cpc1Sv, with alt
alleles iterated within each multi-allelic row.

Both tracks load natively on hs1 and are lifted to hg38 with
hs1ToHg38.over.chain.gz.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py
new file mode 100755
index 00000000000..5faecd7ce9c
--- /dev/null
+++ src/hg/makeDb/scripts/lrSv/lrSvCpc1VcfToBed.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""Convert the CPC+HPRC Phase 1 pangenome VCF (on T2T-CHM13v2) to a
+bed9+7 table for the lrSv/cpc1Sv track.
+
+The input VCF is produced by vcfwave/bcftools norm from a pangenome
+graph, so:
+  * contigs are named "CHM13v2.chrN"
+  * variant IDs are graph snarl traversals like ">2541>2547"
+  * REF and ALT are always explicit sequences (no symbolic ALTs)
+  * each snarl may appear as several rows, one per decomposed alt allele
+
+This script:
+  1. strips the "CHM13v2." contig prefix (hs1 chrom names)
+  2. classifies each alt allele by length delta d = len(ALT) - len(REF)
+     (see the worked example below):
+         d >= +50   -> INS
+         d <= -50   -> DEL
+         |d| < 50 and max(len(REF), len(ALT)) >= 50 -> CPX
+         otherwise  -> dropped (below the 50 bp threshold)
+  3. groups all alt alleles that share the same graph snarl ID into one
+     output row, taking:
+         svType = that class if all alts agree, else MIXED
+         svLen  = max across alts of |d| (INS/DEL) or allele length (CPX)
+         alleleCount = sum of per-alt AC
+         alleleNumber = AN (constant for the snarl)
+         alleleFreq   = alleleCount / alleleNumber
+     The BED interval spans from POS-1 to POS-1 + len(REF).
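+
+Worked example (hypothetical alleles): REF=T with ALT=T plus 300 bp of
+inserted sequence gives d = +300 -> INS, svLen 300; a 180 bp REF with
+ALT=T gives d = -179 -> DEL, svLen 179; an 80 bp REF against a 78 bp ALT
+gives d = -2 but max length 80 -> CPX, svLen 80; REF=A, ALT=AT (d = +1)
+falls below the threshold and is dropped.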
+
+Usage:
+    lrSvCpc1VcfToBed.py input.vcf.gz output.bed chrom.sizes
+"""
+
+import gzip
+import sys
+from collections import defaultdict
+
+# Colors per SV type
+COLORS = {
+    "INS": "0,0,200",       # blue
+    "DEL": "200,0,0",       # red
+    "CPX": "230,140,0",     # orange
+    "MIXED": "120,120,120", # grey
+}
+
+SIZE_THRESHOLD = 50
+
+
+def parse_info(info_str):
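+    """Split a VCF INFO string into a dict; flag entries without '=' map to True."""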
+    d = {}
+    for token in info_str.split(";"):
+        if not token:
+            continue
+        if "=" in token:
+            k, v = token.split("=", 1)
+            d[k] = v
+        else:
+            d[token] = True
+    return d
+
+
+def classify(ref_len, alt_len):
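+    """Classify one alt allele by length delta; returns (svType, magnitude),
+    or (None, 0) when the allele falls below the 50 bp SV threshold."""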
+    d = alt_len - ref_len
+    if d >= SIZE_THRESHOLD:
+        return "INS", d
+    if d <= -SIZE_THRESHOLD:
+        return "DEL", -d
+    if max(ref_len, alt_len) >= SIZE_THRESHOLD:
+        return "CPX", max(ref_len, alt_len)
+    return None, 0
+
+
+def emit(site, fout):
+    """site is dict: chrom, pos0, ref_len, name, types (set), svlen,
+    ac_sum, an, ns."""
+    classes = site["types"]
+    if len(classes) == 1:
+        sv_type = next(iter(classes))
+    else:
+        sv_type = "MIXED"
+    rgb = COLORS[sv_type]
+    chrom = site["chrom"]
+    start = site["pos0"]
+    end = start + max(site["ref_len"], 1)
+    af = (site["ac_sum"] / site["an"]) if site["an"] else 0.0
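+    # allele frequency scaled into the 0-1000 BED score range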
+    score = min(1000, max(0, int(round(af * 1000))))
+    row = [
+        chrom, str(start), str(end),
+        site["name"],
+        str(score),
+        ".",
+        str(start), str(end),
+        rgb,
+        sv_type,
+        str(site["svlen"]),
+        str(site["num_alts"]),
+        str(site["ac_sum"]),
+        str(site["an"]),
+        f"{af:.6f}",
+        str(site["ns"]),
+    ]
+    fout.write("\t".join(row) + "\n")
+
+
+def main():
+    if len(sys.argv) < 4:
+        sys.exit("usage: lrSvCpc1VcfToBed.py input.vcf.gz output.bed chrom.sizes")
+    in_file = sys.argv[1]
+    out_file = sys.argv[2]
+    sizes_file = sys.argv[3]
+
+    # Load chrom sizes to validate contig names; rows on unknown contigs are dropped
+    chrom_sizes = {}
+    with open(sizes_file) as f:
+        for line in f:
+            c, s = line.strip().split("\t")
+            chrom_sizes[c] = int(s)
+
+    opener = gzip.open if in_file.endswith(".gz") else open
+
+    # Accumulator: key = (chrom, pos0, snarl_id)
+    # We stream the file and flush a site whenever we encounter a new key,
+    # because the VCF is sorted and decomposed alts of the same snarl are
+    # adjacent.
+    prev_key = None
+    site = None
+    kept_rows = 0
+    dropped_small = 0
+    dropped_chrom = 0
+    flushed_sites = 0
+
+    with opener(in_file, "rt") as fin, open(out_file, "w") as fout:
+        for line in fin:
+            if line.startswith("#"):
+                continue
+            f = line.rstrip("\n").split("\t", 8)
+            chrom_raw, pos, vid, ref, alt, qual, filt, info = f[:8]
+            if chrom_raw.startswith("CHM13v2."):
+                chrom = chrom_raw[len("CHM13v2."):]
+            else:
+                chrom = chrom_raw
+            if chrom not in chrom_sizes:
+                dropped_chrom += 1
+                continue
+
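+            # decomposed VCF: one alt allele per row (no comma-separated ALTs)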
+            ref_len = len(ref)
+            alt_len = len(alt)
+            sv_type, mag = classify(ref_len, alt_len)
+            if sv_type is None:
+                dropped_small += 1
+                continue
+
+            info_d = parse_info(info)
+            def _int(val, default=0):
+                try:
+                    return int(str(val).split(",")[0])
+                except (ValueError, TypeError):
+                    return default
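+            # AC/AN/NS may be comma-separated lists; _int keeps only the first value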
+            ac = _int(info_d.get("AC", 0))
+            an = _int(info_d.get("AN", 0))
+            ns = _int(info_d.get("NS", 0))
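+            # VCF POS is 1-based; BED start is 0-based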
+            pos0 = int(pos) - 1
+
+            key = (chrom, pos0, vid)
+            if key != prev_key:
+                if site is not None:
+                    emit(site, fout)
+                    flushed_sites += 1
+                site = {
+                    "chrom": chrom,
+                    "pos0": pos0,
+                    "ref_len": ref_len,
+                    "name": vid,
+                    "types": set(),
+                    "svlen": 0,
+                    "num_alts": 0,
+                    "ac_sum": 0,
+                    "an": an,
+                    "ns": ns,
+                }
+                prev_key = key
+
+            site["types"].add(sv_type)
+            if mag > site["svlen"]:
+                site["svlen"] = mag
+            site["num_alts"] += 1
+            site["ac_sum"] += ac
+            # AN, NS should be the same across alts of one site; keep max just in case
+            if an > site["an"]:
+                site["an"] = an
+            if ns > site["ns"]:
+                site["ns"] = ns
+            kept_rows += 1
+
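+        # flush the final accumulated site once the input is exhausted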
+        if site is not None:
+            emit(site, fout)
+            flushed_sites += 1
+
+    print(f"kept alt rows:        {kept_rows}", file=sys.stderr)
+    print(f"dropped <{SIZE_THRESHOLD}bp alt rows: {dropped_small}", file=sys.stderr)
+    print(f"dropped bad chrom:    {dropped_chrom}", file=sys.stderr)
+    print(f"output sites:         {flushed_sites}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()