a37d4ffb473b6ebf6c66545f2b7d5f7eef35ef4b
max
  Fri Apr 10 03:00:33 2026 -0700
Color strVar subtracks by expected heterozygosity instead of motif period; strip "|..." suffix from hgTrackUi numeric filter labels, refs #36652

Change all four strVar subtracks (webstr, tommoStr, trexplorer, viennaVntr) from
motif-period-based coloring to expected heterozygosity (het = 1 - sum(p_i^2);
for webstr, allele frequencies are pooled across the 1000 Genomes populations
weighted by sample size, and items with no frequency data are drawn gray),
using a blue-to-red heat map: dark blue (het < 0.1) through medium blue, light
purple, and salmon to dark red (het >= 0.7). Add het as a filterable bigBed
field with scoreFilter and filterByRange on each track, with score set to
het * 1000. Update mouseOver, track docs, and makedoc. Also fix hgTrackUi to
strip the "|..." suffix from autoSql comments when displaying numeric filter
labels.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/webstr/webstrToBed.py src/hg/makeDb/scripts/webstr/webstrToBed.py
index 7754047adb4..f26bab2b5d8 100644
--- src/hg/makeDb/scripts/webstr/webstrToBed.py
+++ src/hg/makeDb/scripts/webstr/webstrToBed.py
@@ -1,126 +1,160 @@
 #!/usr/bin/env python3
 """Convert WebSTR CSV data to BED9+ format for bigBed conversion.
 
 Reads hg38_repeats_withlinks.csv.gz and hg38_afreqs.csv.gz from the input
 directory and writes a tab-separated BED file to stdout.
 
 Usage:
     webstrToBed.py <inputDir> > webstr.bed
 """
 
 import csv
 import gzip
 import sys
 from collections import defaultdict
 
-PERIOD_COLORS = {
-    1: "255,0,0",       # mono: red
-    2: "0,0,255",       # di: blue
-    3: "0,128,0",       # tri: green
-    4: "255,165,0",     # tetra: orange
-    5: "128,0,128",     # penta: purple
-    6: "70,130,180",    # hexa: steel blue
-}
-DEFAULT_COLOR = "128,128,128"  # gray for period > 6
+HET_BINS = [
+    (0.1, "0,0,180"),       # het < 0.1: dark blue (nearly monomorphic)
+    (0.3, "70,130,230"),    # het 0.1-0.3: medium blue (low diversity)
+    (0.5, "180,130,200"),   # het 0.3-0.5: light purple (moderate diversity)
+    (0.7, "230,100,80"),    # het 0.5-0.7: salmon (high diversity)
+    (999, "180,0,0"),       # het >= 0.7: dark red (very high diversity)
+]
+NO_DATA_COLOR = "128,128,128"  # gray when no allele freq data
 
 
 def truncateMotif(motif, maxLen=25):
     """Truncate motif to maxLen characters with '..' in the middle."""
     if len(motif) <= maxLen:
         return motif
     keepLen = maxLen - 2
     leftLen = (keepLen + 1) // 2
     rightLen = keepLen - leftLen
     return motif[:leftLen] + ".." + motif[-rightLen:]
 
 
+def hetToColor(het):
+    """Map heterozygosity value to an RGB color string."""
+    if het < 0:
+        return NO_DATA_COLOR
+    for threshold, color in HET_BINS:
+        if het < threshold:
+            return color
+    return HET_BINS[-1][1]
+
+
+def computeHet(af):
+    """Compute expected heterozygosity from allele freqs pooled across populations.
+
+    Pools allele frequencies weighted by sample count, then computes
+    het = 1 - sum(p_i^2).
+    """
+    if af is None:
+        return -1.0
+    totalN = 0
+    weightedFreqs = defaultdict(float)
+    for cohort in COHORT_ORDER:
+        entry = af[cohort]
+        n = entry["n"]
+        if n == 0:
+            continue
+        totalN += n
+        for allele, freq in zip(entry["alleles"], entry["freqs"]):
+            weightedFreqs[allele] += float(freq) * n
+    if totalN == 0:
+        return -1.0
+    sumPSq = sum((wf / totalN) ** 2 for wf in weightedFreqs.values())
+    return round(1.0 - sumPSq, 3)
+
+
 COHORT_ORDER = ["AFR", "AMR", "EAS", "EUR", "SAS"]
 COHORT_MAP = {
     "1000 Genomes AFR": "AFR",
     "1000 Genomes AMR": "AMR",
     "1000 Genomes EAS": "EAS",
     "1000 Genomes EUR": "EUR",
     "1000 Genomes SAS": "SAS",
 }
 
 def loadAlleleFreqs(inDir):
     """Load allele frequency data, grouped by repeatid and cohort."""
     freqs = defaultdict(lambda: {c: {"alleles": [], "freqs": [], "n": 0} for c in COHORT_ORDER})
     path = f"{inDir}/hg38_afreqs.csv.gz"
     with gzip.open(path, "rt") as f:
         reader = csv.reader(f)
         header = next(reader)  # skip header
         for row in reader:
             cohort_raw, allele, freq, n, repeatid = row
             cohort = COHORT_MAP.get(cohort_raw)
             if cohort is None:
                 continue
             entry = freqs[repeatid][cohort]
             entry["alleles"].append(allele)
             entry["freqs"].append(freq)
             entry["n"] = int(n)
     return freqs
 
 def main():
     if len(sys.argv) != 2:
         print(__doc__, file=sys.stderr)
         sys.exit(1)
 
     inDir = sys.argv[1]
 
     print("Loading allele frequencies...", file=sys.stderr)
     afreqs = loadAlleleFreqs(inDir)
     print(f"  Loaded frequencies for {len(afreqs)} repeats", file=sys.stderr)
 
     print("Processing repeats...", file=sys.stderr)
     repeatsPath = f"{inDir}/hg38_repeats_withlinks.csv.gz"
     count = 0
     with gzip.open(repeatsPath, "rt") as f:
         reader = csv.reader(f)
         header = next(reader)  # skip header
         for row in reader:
             repeatid, panel, chrom, motif, start, end, period, numcopies, _webstr_link = row
-            period_int = int(period)
-            color = PERIOD_COLORS.get(period_int, DEFAULT_COLOR)
-
             # Source coordinates are 1-based; convert start to 0-based for BED
             start = str(int(start) - 1)
 
+            # Compute heterozygosity pooled across populations
+            af = afreqs.get(repeatid)
+            het = computeHet(af)
+            color = hetToColor(het)
+            score = max(0, int(het * 1000)) if het >= 0 else 0
+
             # BED9 fields
             fields = [
                 chrom,
                 start,
                 end,
                 truncateMotif(motif) + "x" + numcopies,  # name
-                "0",            # score
+                str(score),     # score (het * 1000)
                 ".",            # strand
                 start,          # thickStart
                 end,            # thickEnd
                 color,          # itemRgb
                 motif,
                 period,
                 numcopies,
                 repeatid,
+                str(het),       # het field
             ]
-
-            # Allele frequency fields for each cohort (logfmt: allele=freq pairs)
-            af = afreqs.get(repeatid)
             for cohort in COHORT_ORDER:
                 if af and af[cohort]["alleles"]:
                     entry = af[cohort]
                     pairs = [a + "=" + f for a, f in zip(entry["alleles"], entry["freqs"])]
                     fields.append(" ".join(pairs))
                     fields.append(str(entry["n"]))
                 else:
                     fields.append("")
                     fields.append("0")
 
             print("\t".join(fields))
             count += 1
             if count % 500000 == 0:
                 print(f"  Processed {count} repeats...", file=sys.stderr)
 
     print(f"Done. Wrote {count} records.", file=sys.stderr)
 
 if __name__ == "__main__":
     main()