9a11061ca6b40fe16bdfd09b1af53192f6c7c85b
max
  Tue Apr 21 08:13:02 2026 -0700
lrSv: add HTML doc pages and conversion scripts for recent subtracks, + hs1 HGSVC3

Subtrack stanzas for these SV callsets landed in earlier commits but
the conversion scripts and per-track HTML description pages were
never added; trackDb therefore had no doc to serve. This commit
catches up.

Docs (new):
- colorsDbSv.html     CoLoRSdb 1,427-sample long-read SVs
- gustafsonSv.html    1KG ONT 100 (Gustafson 2024, PMID 39358015)
- hgsvc2Sv.html       HGSVC2 (Ebert 2021, PMID 33632895)
- hprc2Sv.html        HPRC release-2 pangenome SVs (no PMID yet;
see humanpangenome.org/hprc-data-release-2/)
- onekg3202Sr.html    1KG 3202 Illumina SHORT-READ GATK-SV
(Byrska-Bishop 2022, PMID 36055201)

Scripts (new):
- lrSvGustafson.as / lrSvGustafsonVcfToBed.py
- lrSvHgsvc2.as / lrSvHgsvc2TsvToBed.py  (merges insdel + inv tables)
- lrSvHprc2.as / lrSvHprc2VcfToBed.py    (streams wave-decomposed VCF,
explodes multi-allelic rows,
filters to SV-sized or INV)
- lrSv1kg3202Sr.as / lrSv1kg3202SrVcfToBed.py

HGSVC3 also on hs1:
- hgsvc3Sv.html: note that the hs1 build is native (not lifted):
HGSVC3 aligned all assemblies to both GRCh38 and T2T-CHM13 and
released separate annotation tables per reference. Added the
T2T-CHM13 source URL to the Methods section and the hs1 hgsvc3.bb
download link to Data Access.
- doc/hs1/lrSv.txt (new): hs1-specific wget + build steps; refers
back to doc/hg38/lrSv.txt for the full process.

refs #36258

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py
new file mode 100644
index 00000000000..7492432ddac
--- /dev/null
+++ src/hg/makeDb/scripts/lrSv/lrSvHgsvc2TsvToBed.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+"""Convert HGSVC2 v2.0 integrated annotation TSVs (insdel + inv) to merged BED9+.
+
+Usage:
+    lrSvHgsvc2TsvToBed.py insdel.tsv.gz inv.tsv.gz output.bed
+
+The two TSVs share an envelope (chrom, pos, end, svtype, svlen, MERGE_AC /
+MERGE_SAMPLES, cytoband, REF_SD, REF_TRF, REFSEQ_*, PLI/LOEUF, WIN_500/2K)
+but diverge on type-specific fields: insdel has POP_*_AF population allele
+frequencies, inv has RGN_REF_INNER. Columns for both are emitted; type-specific
+ones are left empty where not applicable.
+
+Source:
+    https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC2/release/v2.0/integrated_callset/
+Paper:
+    Ebert et al. 2021, Science, PMID 33632895.
+"""
+
+import csv
+import gzip
+import sys
+
+SV_COLORS = {  # itemRgb strings keyed by SVTYPE; emit() falls back to gray for other types
+    "DEL": "200,0,0",      # red
+    "INS": "0,0,200",      # blue
+    "INV": "230,140,0",    # orange
+}
+
+
+def openTsv(path):  # open a TSV for reading in text mode, transparently handling .gz
+    return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "rt")
+
+
+def na(val):  # normalize missing-value markers (None/""/"NA"/"."/"nan") to "", else return stripped value
+    if val is None:
+        return ""
+    v = val.strip()
+    if v in ("", "NA", ".", "nan"):
+        return ""
+    return v
+
+
+def toInt(s):  # lenient int parse: missing ("", None, "NA", ".") or unparseable values become 0
+    if not s or s in ("NA", "."):
+        return 0
+    try:
+        return int(float(s))  # via float so values like "123.0" still parse
+    except ValueError:
+        return 0
+
+
+def toFloat(s):  # lenient float parse: missing ("", None, "NA", ".") or unparseable values become 0.0
+    if not s or s in ("NA", "."):
+        return 0.0
+    try:
+        return float(s)
+    except ValueError:
+        return 0.0
+
+
+def countSamples(mergeSamples):
+    """Return unique-sample count, stripping -h1/-h2/-un haplotype suffixes."""
+    if not mergeSamples:
+        return 0
+    uniq = set()
+    for p in mergeSamples.split(","):
+        p = p.strip()
+        if not p:
+            continue
+        for suf in ("-h1", "-h2", "-un", "-1", "-2"):  # longer suffixes first so "-h1" isn't truncated to just "-1"
+            if p.endswith(suf):
+                p = p[: -len(suf)]
+                break  # strip at most one suffix per entry
+        uniq.add(p)
+    return len(uniq)
+
+
+def emit(outF, row, typeExtra):  # write one BED 9+27 row; typeExtra supplies the type-specific trailing fields
+    chrom = row["#CHROM"]
+    if not chrom.startswith("chr"):
+        chrom = "chr" + chrom  # normalize to UCSC-style chrom names
+    # HGSVC2 TSV coords are 0-based half-open (matches HGSVC3).
+    chromStart = toInt(row["POS"])
+    chromEnd = toInt(row["END"])
+    if chromEnd <= chromStart:
+        chromEnd = chromStart + 1  # give point features (e.g. insertions) a 1 bp span so the BED stays valid
+
+    svType = row["SVTYPE"]
+    svLen = abs(toInt(row["SVLEN"]))  # SVLEN may be signed; store the magnitude only
+    color = SV_COLORS.get(svType, "100,100,100")  # gray fallback for unexpected SVTYPEs
+
+    alleleCount = toInt(row.get("MERGE_AC", "0"))
+    sampleCount = countSamples(row.get("MERGE_SAMPLES", ""))  # unique samples after haplotype-suffix stripping
+
+    bed = [
+        chrom,
+        str(chromStart),
+        str(chromEnd),
+        row["ID"],
+        "0",  # score: unused
+        ".",  # strand: not meaningful for these records
+        str(chromStart),  # thickStart == chromStart: no separate thick region
+        str(chromEnd),
+        color,
+        svType,
+        str(svLen),
+        str(alleleCount),
+        str(sampleCount),
+        na(row.get("BAND", "")),
+        f"{toFloat(row.get('REF_SD', '0')):.6f}",
+        na(row.get("REF_TRF", "")),
+        str(toInt(row.get("REFSEQ_CDS", "0"))),
+        str(toInt(row.get("REFSEQ_UTR3", "0"))),
+        str(toInt(row.get("REFSEQ_UTR5", "0"))),
+        str(toInt(row.get("REFSEQ_INTRON", "0"))),
+        str(toInt(row.get("REFSEQ_NCRNA", "0"))),
+        str(toInt(row.get("REFSEQ_UP_5K", "0"))),
+        str(toInt(row.get("REFSEQ_DN_5K", "0"))),
+        na(row.get("PLI_MAX", "")),
+        na(row.get("LOEUF_MIN", "")),
+        typeExtra.get("popAllAf", ""),  # insdel-only population AFs; empty for inv rows
+        typeExtra.get("popAfrAf", ""),
+        typeExtra.get("popAmrAf", ""),
+        typeExtra.get("popEasAf", ""),
+        typeExtra.get("popEurAf", ""),
+        typeExtra.get("popSasAf", ""),
+        typeExtra.get("regionRefInner", ""),  # inv-only; empty for insdel rows
+        row.get("MERGE_SAMPLES", "") or "",  # raw comma-separated sample/haplotype list
+        na(row.get("DISC_CLASS", "")),
+        na(row.get("WIN_500", "")),
+        na(row.get("WIN_2K", "")),
+    ]
+    outF.write("\t".join(bed) + "\n")
+
+
+def main():  # CLI: lrSvHgsvc2TsvToBed.py insdel.tsv[.gz] inv.tsv[.gz] output.bed
+    if len(sys.argv) != 4:
+        print(__doc__, file=sys.stderr)  # module docstring doubles as the usage message
+        sys.exit(1)
+
+    insdelPath, invPath, outPath = sys.argv[1], sys.argv[2], sys.argv[3]
+
+    with open(outPath, "w") as outF:  # NOTE(review): output is insdel-then-inv order, not position-sorted; sort before bedToBigBed
+        with openTsv(insdelPath) as fIn:
+            for row in csv.DictReader(fIn, delimiter="\t"):  # insdel table carries POP_*_AF columns
+                emit(outF, row, {
+                    "popAllAf": na(row.get("POP_ALL_AF", "")),
+                    "popAfrAf": na(row.get("POP_AFR_AF", "")),
+                    "popAmrAf": na(row.get("POP_AMR_AF", "")),
+                    "popEasAf": na(row.get("POP_EAS_AF", "")),
+                    "popEurAf": na(row.get("POP_EUR_AF", "")),
+                    "popSasAf": na(row.get("POP_SAS_AF", "")),
+                })
+
+        with openTsv(invPath) as fIn:
+            for row in csv.DictReader(fIn, delimiter="\t"):  # inv table carries RGN_REF_INNER instead of AFs
+                emit(outF, row, {
+                    "regionRefInner": na(row.get("RGN_REF_INNER", "")),
+                })
+
+
+if __name__ == "__main__":  # standard script entry point
+    main()