src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py 7594507ca126d5242346787e42e13c52ea7709b1

7594507ca126d5242346787e42e13c52ea7709b1
max
  Fri Apr 17 08:40:31 2026 -0700
Add lrSv supertrack: long-read structural variants from 9 studies (hg38).

#Preview2 week - bugs introduced now will need a build patch to fix
Sub-tracks (all bigBed 9+):
han945Sv     - 945 Han Chinese, ONT (Gong 2025, PMID 39929826)
lrSv1kgOnt   - 1019 1000 Genomes, ONT, SVAN-annotated (Schloissnig 2025,
PMID 40702182; lifted from hs1)
tommoJpSv    - 333 Japanese (111 trios), ONT (Otsuki 2022, PMID 36127505)
aou1kSv      - 1027 All of Us, PacBio HiFi (Garimella 2025, PMID 41256123)
ga4kSv       - 502 GA4K pediatric rare disease, PacBio HiFi
(Cohen 2022, PMID 35305867)
decodeSv     - 3622 Icelanders, ONT (Beyter 2021, PMID 33972781)
hgsvc3Sv     - 65 HGSVC3 diverse haplotype-resolved assemblies, HiFi+ONT
(Logsdon 2025, PMID 40702183; merges insdel+inv tables)
kwanhoSv     - 100 post-mortem brains (PD/ILBD/HC), PacBio HiFi
(Kim 2026, PMID 41929179)
chirmade101Sv - 101 long-read WGS GWAS SVatalog cohort
(Chirmade 2026, PMID 41203876)

Includes per-track conversion scripts and autoSql under
scripts/lrSv/, the supertrack summary table in lrSv.html, and a
consolidated makeDoc at doc/hg38/lrSv.txt.

refs #36258

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py
new file mode 100644
index 00000000000..85dbd766430
--- /dev/null
+++ src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+"""Convert AoU 1K long-read SV CSV to BED9+ for bigBed.
+
+Input is a gzipped CSV (media-2.gz) with columns:
+  SV_coordinate, SV_ID, SV type, SV length, Mean GenotypePosterior nonref,
+  AF(AFR,AMR,EAS,EUR,SAS), Fst(AFR vs Non-AFR), OMIM genes, Disease genes,
+  Cancer genes, ACMG genes, OMIM CDS, Disease CDS, Cancer CDS, ACMG CDS,
+  Regulatory element, SegDUP, Tandem repeats, Other LR datasets,
+  Detected in AoU SR, eQTLs, GWAS, SV-trait associations,
+  SR validation, LR assembly-supported, Locityper validation
+
+Usage:
+    lrSvAou1kCsvToBed.py input.csv.gz output.bed
+"""
+
+import csv
+import gzip
+import sys
+
+# itemRgb strings keyed by SV type; main() uses "100,100,100" for other types.
+SV_COLORS = dict(
+    DEL="200,0,0", INS="0,0,200",
+)
+
+def na(val):
+    """Blank out placeholder cells: "NA", "No" and "" all become ""."""
+    placeholders = ("NA", "No", "")
+    isPlaceholder = val in placeholders
+    return "" if isPlaceholder else val
+
+def _parseAfs(afStr, nPops=5):
+    """Parse comma-separated population AFs into nPops floats.
+
+    Each entry is parsed on its own, so one missing or malformed value no
+    longer zeroes its neighbours; unparsable, absent and non-finite entries
+    become 0.0.
+
+    Returns a list of exactly nPops floats.
+    """
+    import math  # function-scope import keeps the file's import block untouched
+
+    parts = afStr.split(",")
+    afs = []
+    for i in range(nPops):
+        try:
+            af = float(parts[i])
+        except (ValueError, IndexError):
+            af = 0.0
+        # "nan"/"inf" parse fine but would crash the int() score conversion.
+        afs.append(af if math.isfinite(af) else 0.0)
+    return afs
+
+def main():
+    """Convert the gzipped AoU 1K SV CSV (argv[1]) into a BED9+ file (argv[2]).
+
+    Prints the module usage text and exits with status 1 unless exactly two
+    arguments are given. The first CSV row is treated as a header and skipped;
+    blank rows are ignored. Each remaining row becomes one tab-separated line.
+
+    Output columns: the nine standard BED fields (score = max population AF
+    scaled to 0-1000, thickStart/thickEnd = the feature span), then svType,
+    svLen, the five population AFs, Fst, and the annotation columns with
+    "NA"/"No" blanked by na().
+    """
+    if len(sys.argv) != 3:
+        print(__doc__, file=sys.stderr)
+        sys.exit(1)
+
+    inFile, outFile = sys.argv[1], sys.argv[2]
+
+    # newline="" lets the csv module do its own line-ending handling, as its
+    # documentation requires; explicit UTF-8 avoids locale-dependent decoding.
+    with gzip.open(inFile, "rt", encoding="utf-8", newline="") as fIn, \
+            open(outFile, "w", encoding="utf-8") as fOut:
+        reader = csv.reader(fIn)
+        next(reader, None)  # skip header; tolerate a completely empty file
+
+        for row in reader:
+            if not row:
+                continue  # csv.reader yields [] for blank lines
+
+            coord = row[0]       # chr1:10627
+            svType = row[2]
+            svLen = int(row[3])
+
+            # Coordinate is 1-based; BED is 0-based half-open.
+            chrom, posStr = coord.split(":")
+            chromStart = int(posStr) - 1
+            if svType == "DEL":
+                # abs() keeps the interval valid should a length be negative
+                chromEnd = chromStart + abs(svLen)
+            else:
+                # INS: place at insertion site, 1 bp wide
+                chromEnd = chromStart + 1
+
+            name = f"{svType} {svLen}bp"
+            color = SV_COLORS.get(svType, "100,100,100")
+
+            # Population AFs (column 5): "0.001,0.002,0.003,0.004,0.005",
+            # ordered AFR, AMR, EAS, EUR, SAS.
+            afs = _parseAfs(row[5])
+
+            # Use max population AF as score, clamped to BED's 0-1000 range
+            score = max(0, min(int(round(max(afs) * 1000)), 1000))
+
+            bedRow = [
+                chrom,
+                str(chromStart),
+                str(chromEnd),
+                name,
+                str(score),
+                ".",
+                # thickStart/thickEnd mirror the feature span
+                str(chromStart),
+                str(chromEnd),
+                color,
+                svType,
+                str(svLen),
+            ]
+            bedRow.extend(f"{af:.6f}" for af in afs)
+
+            # Fst, gene-level overlaps (cols 7-10; the CDS-level cols 11-14 are
+            # skipped as a subset), then cols 15-22: regulatory element, SegDup,
+            # tandem repeats, other LR datasets, AoU SR, eQTLs, GWAS, SV-trait.
+            bedRow.extend(na(row[i]) for i in range(6, 11))
+            bedRow.extend(na(row[i]) for i in range(15, 23))
+            fOut.write("\t".join(bedRow) + "\n")
+
+if __name__ == "__main__":  # run the conversion only when executed as a script
+    main()