src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py ef61e73fc416622d8557ec2439df2344a1cc80c3

ef61e73fc416622d8557ec2439df2344a1cc80c3
max
  Tue Jun 9 15:10:01 2026 -0700
lrSv: replace HPRC v2.0 pangenome SV track with v2.1 (hprc2v21Sv)

Drop the v2.0 wave-decomposed hprc2Sv track and add hprc2v21Sv built from
the HPRC v2.1 minigraph-cactus raw vg deconstruct VCFs (gref95.ro), on both
hg38 (GRCh38 path, 596,063 SVs) and hs1 (T2T-CHM13 path, 608,435 SVs). The
v2.1 files lack per-allele TYPE/LEN, so the new converter classifies INS/DEL
by parsimony-trimming REF/ALT and the net length change. The v2.0 build
recipe, converter and schema are kept but commented out in the makeDocs in
case wave-decomposed VCFs are released again, refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py
new file mode 100644
index 00000000000..57efda1917b
--- /dev/null
+++ src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+"""Convert the HPRC release-2.1 minigraph-cactus pangenome VCF to BED9+.
+
+This is the v2.1 "gref95.ro" callset (Glenn Hickey, HPRC). Unlike the v2.0
+"wave" VCF, this file is the raw `vg deconstruct` output: it carries graph
+traversals (AT), nested snarls (LV) and a parent-snarl pointer (PS), and it
+has NO per-allele TYPE or LEN fields. We therefore classify each ALT by
+parsimony-trimming the shared prefix/suffix of REF/ALT and looking at the
+net length change:
+
+    trimmed ref empty           -> INS   (pure insertion)
+    trimmed alt empty           -> DEL   (pure deletion)
+    both non-empty, alt longer  -> INS   (net insertion, ragged ends)
+    both non-empty, ref longer  -> DEL   (net deletion, ragged ends)
+    both non-empty, equal length-> CPX   (balanced substitution / MNP)
+
+An allele is kept when abs(len(ALT) - len(REF)) >= 50 bp, matching the
+length threshold used for the v2.0 track. AC / AF / AN / NS / LV are taken
+from the INFO fields. There is no INV annotation in this file, so no
+inversions are emitted (balanced events fall into CPX).
+
+Usage:
+    lrSvHprc2RoVcfToBed.py input.vcf.gz output.bed
+
+Source:
+    https://humanpangenome.org/hprc-data-release-2/
+    hprc-v2.1-mc-grch38.gref95.ro.vcf.gz (233-sample minigraph-cactus graph).
+"""
+
+import gzip
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from lrSvCommon import svName
+
+MIN_SV_LEN = 50
+
+SV_COLORS = {
+    "INS": "0,0,200",      # blue
+    "DEL": "200,0,0",      # red
+    "CPX": "140,0,200",    # purple
+}
+
+
+def parseInfo(infoStr):
+    d = {}
+    for item in infoStr.split(";"):
+        if "=" in item:
+            k, v = item.split("=", 1)
+            d[k] = v
+        else:
+            d[item] = True
+    return d
+
+
+def splitMulti(s, n):
+    """Split a per-allele INFO value; pad with '' if missing."""
+    if s is None:
+        return [""] * n
+    parts = s.split(",")
+    if len(parts) < n:
+        parts = parts + [""] * (n - len(parts))
+    return parts
+
+
+def classify(ref, alt):
+    """Parsimony-trim REF/ALT and classify; return (svType, refSpan).
+
+    refSpan is the number of trimmed reference bases the event spans, used
+    to position chromEnd. The original chromStart is shifted by the trimmed
+    prefix length so the feature sits on the actual changed bases.
+    """
+    # trim shared prefix
+    p = 0
+    while p < len(ref) and p < len(alt) and ref[p] == alt[p]:
+        p += 1
+    r = ref[p:]
+    a = alt[p:]
+    # trim shared suffix
+    s = 0
+    while s < len(r) and s < len(a) and r[-1 - s] == a[-1 - s]:
+        s += 1
+    if s:
+        r = r[:len(r) - s]
+        a = a[:len(a) - s]
+    lr, la = len(r), len(a)
+    if la > lr:
+        svType = "INS"
+    elif lr > la:
+        svType = "DEL"
+    else:
+        svType = "CPX"
+    return svType, p, lr
+
+
+def main():
+    if len(sys.argv) != 3:
+        print(__doc__, file=sys.stderr)
+        sys.exit(1)
+
+    inPath, outPath = sys.argv[1], sys.argv[2]
+    opener = gzip.open if inPath.endswith(".gz") else open
+
+    counts = {"INS": 0, "DEL": 0, "CPX": 0}
+    nested = 0
+
+    with opener(inPath, "rt") as fIn, open(outPath, "w") as fOut:
+        for line in fIn:
+            if line.startswith("#"):
+                continue
+            fields = line.rstrip("\n").split("\t")
+            chrom = fields[0]
+            pos = int(fields[1])
+            rowId = fields[2]
+            ref = fields[3]
+            alts = fields[4].split(",")
+            info = parseInfo(fields[7])
+
+            n = len(alts)
+            acList = splitMulti(info.get("AC"), n)
+            afList = splitMulti(info.get("AF"), n)
+
+            try:
+                nSamples = int(info.get("NS", "0"))
+            except ValueError:
+                nSamples = 0
+            try:
+                an = int(info.get("AN", "0"))
+            except ValueError:
+                an = 0
+            try:
+                snarlLevel = int(info.get("LV", "0"))
+            except ValueError:
+                snarlLevel = 0
+
+            for i, alt in enumerate(alts):
+                if alt in (".", "*"):
+                    continue
+
+                svLenVal = abs(len(alt) - len(ref))
+                # Filter: keep SV-sized alleles by net length change.
+                if svLenVal < MIN_SV_LEN:
+                    continue
+
+                svType, prefixLen, refSpan = classify(ref, alt)
+                color = SV_COLORS.get(svType, "100,100,100")
+
+                # chromStart shifts past the shared prefix to the first
+                # changed base; INS gets a 1-bp anchor, DEL/CPX span the
+                # trimmed reference interval.
+                chromStart = (pos - 1) + prefixLen
+                if svType == "INS":
+                    chromEnd = chromStart + 1
+                else:
+                    chromEnd = chromStart + max(1, refSpan)
+                if chromEnd <= chromStart:
+                    chromEnd = chromStart + 1
+
+                try:
+                    ac = int(acList[i])
+                except ValueError:
+                    ac = 0
+                try:
+                    af = float(afList[i])
+                except ValueError:
+                    af = 0.0
+
+                svLen = chromEnd - chromStart
+                if svType == "INS":
+                    insLen = abs(len(alt) - len(ref))
+                else:
+                    insLen = 0
+
+                if rowId and rowId != ".":
+                    baseId = f"{rowId}.{i + 1}" if n > 1 else rowId
+                else:
+                    baseId = f"{chrom}:{pos}:{svType}:{svLenVal}"
+                if len(baseId) > 200:
+                    baseId = f"{chrom}:{pos}:{svType}:{svLenVal}"
+
+                featLen = insLen if svType == "INS" else svLen
+                name = svName(svType, featLen, ac)
+
+                counts[svType] += 1
+                if snarlLevel > 0:
+                    nested += 1
+
+                row = [
+                    chrom,
+                    str(chromStart),
+                    str(chromEnd),
+                    name,
+                    "0",
+                    ".",
+                    str(chromStart),
+                    str(chromEnd),
+                    color,
+                    svType,
+                    str(svLen),
+                    str(insLen),
+                    str(ac),
+                    str(an),
+                    f"{af:.6f}",
+                    str(nSamples),
+                    str(snarlLevel),
+                ]
+                fOut.write("\t".join(row) + "\n")
+
+    total = sum(counts.values())
+    print(f"kept {total} SV-sized alleles: "
+          f"{counts['INS']} INS, {counts['DEL']} DEL, {counts['CPX']} CPX "
+          f"({nested} at nested snarl levels LV>0)", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()