#!/usr/bin/env python3
"""Convert HGSVC3 final annotation TSVs (insdel + inv) to a merged BED9+ file.

Usage:
    lrSvHgsvc3TsvToBed.py insdel.tsv.gz inv.tsv.gz output.bed
"""

import csv
import gzip
import sys

# itemRgb values keyed by SVTYPE; unknown types fall back to gray.
SV_COLORS = {
    "DEL": "200,0,0",  # red
    "INS": "0,0,200",  # blue
    "INV": "230,140,0",  # orange
}


def openTsv(path):
    """Open a plain or gzip-compressed TSV for text-mode reading."""
    return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "rt")


def countSamples(mergeSamples):
    """Return (haplotypeCount, uniqSampleCount).

    MERGE_SAMPLES is a comma-separated list of haplotype IDs like
    HG00096-h1,HG00096-h2,HG00514-un. Haplotype count is the length of that
    list. Unique sample count strips the -h1/-h2/-un suffix.
    """
    if not mergeSamples:
        return 0, 0
    parts = mergeSamples.split(",")
    uniq = set()
    for p in parts:
        p = p.strip()
        if not p:
            continue
        for suf in ("-h1", "-h2", "-un"):
            if p.endswith(suf):
                p = p[: -len(suf)]
                break
        uniq.add(p)
    return len(parts), len(uniq)


def emit(outF, row, typeExtra):
    """Write one BED9+ row to outF.

    row is a dict from csv.DictReader over one source TSV line. typeExtra
    holds the fields that differ between insdel and inv tables (homRef,
    homTig, te for insdel; regionRefInner for inv); fields absent from
    typeExtra are emitted as empty strings so the column count stays fixed.
    """
    chrom = row["#CHROM"]
    pos = int(row["POS"])  # 0-based in source TSV based on sample inspection
    end = int(row["END"])
    svType = row["SVTYPE"]
    # SVLEN may be absent or non-numeric (e.g. "NA"); treat both as length 0
    # rather than crashing partway through a conversion.
    try:
        svLen = int(row.get("SVLEN") or 0)
    except ValueError:
        svLen = 0
    absSvLen = abs(svLen)

    # Source coords are already 0-based half-open (chr1 POS=10669 END=10901
    # matches a 232-bp deletion starting at 10670 in 1-based). Use as-is.
    chromStart = pos
    chromEnd = end
    # Insertions can have END == POS; give them a 1-bp footprint so that
    # chromEnd > chromStart always holds in the output BED.
    if chromEnd <= chromStart:
        chromEnd = chromStart + 1

    haploCount, sampleCount = countSamples(row.get("MERGE_SAMPLES", ""))

    color = SV_COLORS.get(svType, "100,100,100")

    refTrf = row.get("REF_TRF", "")
    # REF_SD may be empty or non-numeric; default to 0.0.
    try:
        refSd = float(row.get("REF_SD", "0") or 0)
    except ValueError:
        refSd = 0.0

    bedRow = [
        chrom,
        str(chromStart),
        str(chromEnd),
        row["ID"],
        "0",
        ".",
        str(chromStart),
        str(chromEnd),
        color,
        svType,
        str(absSvLen),
        str(haploCount),
        str(sampleCount),
        typeExtra.get("homRef", ""),
        typeExtra.get("homTig", ""),
        typeExtra.get("regionRefInner", ""),
        typeExtra.get("te", ""),
        refTrf,
        f"{refSd:.6f}",
        row.get("MERGE_SAMPLES", ""),
        row.get("WIN_500", ""),
        row.get("WIN_2K", ""),
    ]
    outF.write("\t".join(bedRow) + "\n")


def convertTable(tsvPath, outF, extraCols):
    """Stream one source TSV into outF as BED rows.

    extraCols maps typeExtra key -> source column name for the columns that
    exist only in this table (insdel vs inv).
    """
    with openTsv(tsvPath) as fIn:
        # QUOTE_NONE: the source is plain tab-separated text; the default csv
        # dialect would quote-process literal double quotes inside fields.
        # restval="": short rows yield "" (not None) for missing columns, so
        # the join in emit() cannot hit a TypeError.
        reader = csv.DictReader(fIn, delimiter="\t",
                                quoting=csv.QUOTE_NONE, restval="")
        for row in reader:
            emit(outF, row,
                 {key: row.get(col, "") for key, col in extraCols.items()})


def main():
    """Parse argv and write the merged BED file (insdel rows, then inv)."""
    if len(sys.argv) != 4:
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    insdelPath, invPath, outPath = sys.argv[1], sys.argv[2], sys.argv[3]

    with open(outPath, "w") as outF:
        # insertions + deletions
        convertTable(insdelPath, outF, {
            "homRef": "HOM_REF",
            "homTig": "HOM_TIG",
            "te": "TE",
        })
        # inversions
        convertTable(invPath, outF, {
            "regionRefInner": "RGN_REF_INNER",
        })


if __name__ == "__main__":
    main()