src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py 7594507ca126d5242346787e42e13c52ea7709b1

7594507ca126d5242346787e42e13c52ea7709b1
max
  Fri Apr 17 08:40:31 2026 -0700
Add lrSv supertrack: long-read structural variants from 9 studies (hg38).

#Preview2 week - bugs introduced now will need a build patch to fix
Sub-tracks (all bigBed 9+):
han945Sv     - 945 Han Chinese, ONT (Gong 2025, PMID 39929826)
lrSv1kgOnt   - 1019 1000 Genomes, ONT, SVAN-annotated (Schloissnig 2025,
PMID 40702182; lifted from hs1)
tommoJpSv    - 333 Japanese (111 trios), ONT (Otsuki 2022, PMID 36127505)
aou1kSv      - 1027 All of Us, PacBio HiFi (Garimella 2025, PMID 41256123)
ga4kSv       - 502 GA4K pediatric rare disease, PacBio HiFi
(Cohen 2022, PMID 35305867)
decodeSv     - 3622 Icelanders, ONT (Beyter 2021, PMID 33972781)
hgsvc3Sv     - 65 HGSVC3 diverse haplotype-resolved assemblies, HiFi+ONT
(Logsdon 2025, PMID 40702183; merges insdel+inv tables)
kwanhoSv     - 100 post-mortem brains (PD/ILBD/HC), PacBio HiFi
(Kim 2026, PMID 41929179)
chirmade101Sv - 101 long-read WGS GWAS SVatalog cohort
(Chirmade 2026, PMID 41203876)

Includes per-track conversion scripts and autoSql under
scripts/lrSv/, the supertrack summary table in lrSv.html, and a
consolidated makeDoc at doc/hg38/lrSv.txt.

refs #36258

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py
new file mode 100644
index 00000000000..b790c617518
--- /dev/null
+++ src/hg/makeDb/scripts/lrSv/lrSvChirmade101TsvToBed.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""Convert the Chirmade 2026 SVatalog sv_annotations.tsv to BED9+.
+
+Source: https://zenodo.org/records/13367574 (sv_annotations.tsv)
+Paper:  Chirmade et al. 2026, Heredity (Edinb), PMID 41203876
+
+Coordinates in the source TSV are 1-based closed (End-Start+1 == Length).
+Translate to BED-style 0-based half-open (chromStart = Start - 1,
+chromEnd = End).
+
+Usage:
+    lrSvChirmade101TsvToBed.py sv_annotations.tsv output.bed
+"""
+
+import csv
+import sys
+
+SV_COLORS = {  # SV type -> "R,G,B" string for BED column 9; main() uses gray for other types
+    "del":     "200,0,0",      # red
+    "ins":     "0,0,200",      # blue
+    "dup":     "0,160,0",      # green
+    "inv":     "230,140,0",    # orange
+    "complex": "140,0,200",    # purple
+}
+
+
+def na(val):
+    """Normalize a source cell: missing values become '', others are stripped.
+
+    None, blank strings and the literal 'NA' (after stripping) are missing.
+    """
+    cleaned = "" if val is None else val.strip()
+    # An already-empty string falls through unchanged; only 'NA' needs mapping.
+    return "" if cleaned == "NA" else cleaned
+
+
+def toInt(s):
+    """Parse s as an int (decimals truncated); return 0 if empty or unparsable."""
+    try:
+        # float() accepts '12.0'/'1e3'; int() of inf raises OverflowError, of nan ValueError
+        return int(float(s)) if s else 0
+    except (ValueError, OverflowError):
+        return 0
+
+
+def main():
+    """Convert sv_annotations.tsv (argv[1]) into a BED9+ file (argv[2]).
+
+    Prints the module usage text and exits with status 1 unless exactly two
+    arguments are given. One output line is written per input row: nine
+    BED fields, then SV type, length, GC, cytoband, gene count and the
+    annotation columns listed in extraCols ('NA'/empty values become '').
+    Raises KeyError if Chromosome, Start, End, Type, Length or ID is not
+    in the header; any other absent column yields an empty or 0 field.
+    """
+    if len(sys.argv) != 3:
+        print(__doc__, file=sys.stderr)
+        sys.exit(1)
+
+    inPath, outPath = sys.argv[1], sys.argv[2]
+
+    # Annotation columns emitted after 'Gene Count', in output order.
+    # NOTE(review): presumably mirrors the track's autoSql - keep in sync.
+    extraCols = (
+        "Gene Name(s)", "Gene at Start", "Gene at End", "Exon Name",
+        "CDS Name", "Dark Genes % Overlap", "ClinGen Haploinsufficient",
+        "ClinGen Triplosensitive", "gnomAD O/E LoF Upper",
+        "gnomAD O/E Mis Upper", "gnomAD pLI", "gnomAD pRec",
+        "Repeat % Overlap", "Dirty Region % Overlap", "Chromosome Region",
+        "CGD", "OMIM Pheno", "OMIM Inh", "ClinGen Region", "Decipher Region",
+        "ClinVar VarID", "gnomAD AF Max 90% RO",
+        "gnomAD Population AF Max 90% RO", "gnomAD Hom/Ref Frequency 90% RO",
+        "gnomAD Het Frequency 90% RO", "gnomAD Hom/Alt Frequency 90% RO",
+        "DGV % Overlap", "DGV 50% RO",
+    )
+
+    # newline="" is what the csv module asks for so it can handle line endings.
+    with open(inPath, newline="") as fIn, open(outPath, "w") as fOut:
+        reader = csv.DictReader(fIn, delimiter="\t")
+        for row in reader:
+            chrom = row["Chromosome"]
+            if not chrom.startswith("chr"):
+                chrom = "chr" + chrom
+
+            # 1-based closed -> 0-based half-open. toInt() returns 0 for a
+            # missing/unparsable Start, which would give chromStart == -1;
+            # BED starts cannot be negative, so clamp to 0 and warn.
+            chromStart = toInt(row["Start"]) - 1
+            if chromStart < 0:
+                print("warning: %s: bad Start %r, using chromStart 0"
+                      % (row["ID"], row["Start"]), file=sys.stderr)
+                chromStart = 0
+            chromEnd = toInt(row["End"])
+            if chromEnd <= chromStart:
+                # Keep every feature at least 1 bp wide.
+                chromEnd = chromStart + 1
+
+            svType = row["Type"]
+            svLen = abs(toInt(row["Length"]))
+            # Types not in SV_COLORS are drawn gray.
+            color = SV_COLORS.get(svType, "100,100,100")
+
+            # Standard BED9 columns first, then the track-specific extra fields.
+            bedRow = [
+                chrom, str(chromStart), str(chromEnd), row["ID"],
+                "0", ".",                          # score, strand (unused)
+                str(chromStart), str(chromEnd),    # thickStart, thickEnd
+                color, svType, str(svLen),
+                str(toInt(row.get("GC (%)", "0"))),
+                na(row.get("Cytoband", "")),
+                str(toInt(row.get("Gene Count", "0"))),
+            ]
+            bedRow.extend(na(row.get(col, "")) for col in extraCols)
+            fOut.write("\t".join(bedRow) + "\n")
+
+
+if __name__ == "__main__":  # run the conversion only when executed as a script
+    main()