src/hg/makeDb/scripts/gnomadStr/gnomadStrToBed.py f43f1239645183b88a00b3de83f74fd5553e6ce1

f43f1239645183b88a00b3de83f74fd5553e6ce1
max
  Thu Mar 12 07:52:35 2026 -0700
Add gnomAD STR genotype track under gnomadVariants supertrack

87 disease-associated STR loci from gnomAD v3.1.3, aggregated from
~1.4M individual genotypes (18,511 WGS samples, ExpansionHunter v5).
Includes allele frequency distributions and population breakdowns.
Added relatedTracks links to strVar supertrack, refs #35420, refs #36652

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/gnomadStr/gnomadStrToBed.py src/hg/makeDb/scripts/gnomadStr/gnomadStrToBed.py
new file mode 100644
index 00000000000..7e34ab26624
--- /dev/null
+++ src/hg/makeDb/scripts/gnomadStr/gnomadStrToBed.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""Aggregate gnomAD STR individual genotypes into per-locus BED9+ summary.
+
+Reads gnomAD_STR_genotypes TSV and outputs one BED record per unique locus
+(Id + Chrom + Start + End), with allele size distributions and sample counts.
+
+Usage:
+    gnomadStrToBed.py <input.tsv.gz> > gnomadStr.bed
+"""
+
+import csv
+import gzip
+import sys
+from collections import defaultdict, Counter
+
+MOTIF_LEN_COLORS = {
+    1: "255,0,0",       # mono: red
+    2: "0,0,255",       # di: blue
+    3: "0,128,0",       # tri: green
+    4: "255,165,0",     # tetra: orange
+    5: "128,0,128",     # penta: purple
+    6: "70,130,180",    # hexa: steel blue
+}
+DEFAULT_COLOR = "128,128,128"
+
+def main():
+    if len(sys.argv) != 2:
+        print(__doc__, file=sys.stderr)
+        sys.exit(1)
+
+    inFile = sys.argv[1]
+
+    # Aggregate per locus: key = (Id, chrom, start, end)
+    loci = {}  # key -> {locusId, motifs, isAdjacent, alleles, nSamples, nPass, populations}
+
+    print("Loading genotypes...", file=sys.stderr)
+    with gzip.open(inFile, "rt") as f:
+        reader = csv.DictReader(f, delimiter="\t")
+        count = 0
+        for row in reader:
+            locusKey = (row["Id"], row["Chrom"], row["Start_0based"], row["End"])
+
+            if locusKey not in loci:
+                # Use the first motif's length for coloring; may have "/" for multi-motif
+                motifField = row["Motif"]
+                firstMotif = motifField.split("/")[0]
+                loci[locusKey] = {
+                    "locusId": row["LocusId"],
+                    "motifs": set(),
+                    "isAdjacent": row["IsAdjacentRepeat"],
+                    "alleles": Counter(),
+                    "nSamples": 0,
+                    "nPass": 0,
+                    "populations": Counter(),
+                    "motifLen": len(firstMotif),
+                }
+
+            rec = loci[locusKey]
+            rec["motifs"].add(row["Motif"])
+            rec["nSamples"] += 1
+            if row["Filter"] == "PASS":
+                rec["nPass"] += 1
+            rec["populations"][row["Population"]] += 1
+
+            # Count allele sizes (both alleles)
+            a1 = row["Allele1"]
+            a2 = row["Allele2"]
+            if a1:
+                rec["alleles"][a1] += 1
+            if a2:
+                rec["alleles"][a2] += 1
+
+            count += 1
+            if count % 500000 == 0:
+                print(f"  Processed {count} genotypes...", file=sys.stderr)
+
+    print(f"Loaded {count} genotypes across {len(loci)} loci.", file=sys.stderr)
+
+    # Output BED records
+    for (locusId_raw, chrom, start, end), rec in sorted(loci.items(), key=lambda x: (x[0][1], int(x[0][2]))):
+        color = MOTIF_LEN_COLORS.get(rec["motifLen"], DEFAULT_COLOR)
+        motifs = ",".join(sorted(rec["motifs"]))
+
+        # Build allele frequency distribution (sorted by allele size numerically)
+        alleleCounts = rec["alleles"]
+        totalAlleles = sum(alleleCounts.values())
+        if totalAlleles > 0:
+            # Try to sort numerically, fall back to string sort
+            try:
+                sortedAlleles = sorted(alleleCounts.keys(), key=lambda x: float(x))
+            except ValueError:
+                sortedAlleles = sorted(alleleCounts.keys())
+            alleleStr = ",".join(sortedAlleles)
+            freqStr = ",".join(f"{alleleCounts[a]/totalAlleles:.4f}" for a in sortedAlleles)
+        else:
+            alleleStr = ""
+            freqStr = ""
+
+        # Population summary
+        popStr = ",".join(f"{p}:{n}" for p, n in sorted(rec["populations"].items(), key=lambda x: -x[1]))
+
+        fields = [
+            chrom,
+            start,
+            end,
+            locusId_raw,        # name
+            "0",                # score
+            ".",                # strand
+            start,              # thickStart
+            end,                # thickEnd
+            color,              # itemRgb
+            rec["locusId"],     # gene/locus
+            motifs,             # motif(s)
+            rec["isAdjacent"],
+            str(rec["nSamples"]),
+            str(rec["nPass"]),
+            alleleStr,
+            freqStr,
+            popStr,
+        ]
+        print("\t".join(fields))
+
+    print(f"Wrote {len(loci)} locus records.", file=sys.stderr)
+
+if __name__ == "__main__":
+    main()