src/hg/makeDb/scripts/webstr/webstrToBed.py 8efbcad3f2816dec50b5671a4445d2e6943f7f91

8efbcad3f2816dec50b5671a4445d2e6943f7f91
max
  Mon Apr 13 07:57:41 2026 -0700
Use light gray for monomorphic strVar loci (het=0), distinct from no-data gray, refs #36652

Add a separate color for loci where heterozygosity is exactly 0 (single allele
observed) across all four strVar subtracks: light gray (200,200,200). This
distinguishes them from the existing medium gray (128,128,128) used when no
allele frequency data is available. Previously het=0 was lumped into the
dark blue "nearly monomorphic" bin. Also expand the itemRgb field description
in all four .as files to list the full color scheme.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/webstr/webstrToBed.py src/hg/makeDb/scripts/webstr/webstrToBed.py
index f26bab2b5d8..8178fbef8c4 100644
--- src/hg/makeDb/scripts/webstr/webstrToBed.py
+++ src/hg/makeDb/scripts/webstr/webstrToBed.py
@@ -1,160 +1,163 @@
 #!/usr/bin/env python3
 """Convert WebSTR CSV data to BED9+ format for bigBed conversion.
 
 Reads hg38_repeats_withlinks.csv.gz and hg38_afreqs.csv.gz from the input
 directory and writes a tab-separated BED file to stdout.
 
 Usage:
     webstrToBed.py <inputDir> > webstr.bed
 """
 
 import csv
 import gzip
 import sys
 from collections import defaultdict
 
 HET_BINS = [
     (0.1, "0,0,180"),       # het < 0.1: dark blue (nearly monomorphic)
     (0.3, "70,130,230"),    # het 0.1-0.3: medium blue (low diversity)
     (0.5, "180,130,200"),   # het 0.3-0.5: light purple (moderate diversity)
     (0.7, "230,100,80"),    # het 0.5-0.7: salmon (high diversity)
     (999, "180,0,0"),       # het >= 0.7: dark red (very high diversity)
 ]
-NO_DATA_COLOR = "128,128,128"  # gray when no allele freq data
+NO_DATA_COLOR = "128,128,128"       # medium gray when no allele freq data
+MONOMORPHIC_COLOR = "200,200,200"   # light gray when het == 0 (fixed allele)
 
 
 def truncateMotif(motif, maxLen=25):
     """Truncate motif to maxLen characters with '..' in the middle."""
     if len(motif) <= maxLen:
         return motif
     keepLen = maxLen - 2
     leftLen = (keepLen + 1) // 2
     rightLen = keepLen - leftLen
     return motif[:leftLen] + ".." + motif[-rightLen:]
 
 
 def hetToColor(het):
     """Map heterozygosity value to an RGB color string."""
     if het < 0:
         return NO_DATA_COLOR
+    if het == 0:
+        return MONOMORPHIC_COLOR
     for threshold, color in HET_BINS:
         if het < threshold:
             return color
     return HET_BINS[-1][1]
 
 
 def computeHet(af):
     """Compute expected heterozygosity from allele freqs pooled across populations.
 
     Pools allele frequencies weighted by sample count, then computes
     het = 1 - sum(p_i^2).
     """
     if af is None:
         return -1.0
     totalN = 0
     weightedFreqs = defaultdict(float)
     for cohort in COHORT_ORDER:
         entry = af[cohort]
         n = entry["n"]
         if n == 0:
             continue
         totalN += n
         for allele, freq in zip(entry["alleles"], entry["freqs"]):
             weightedFreqs[allele] += float(freq) * n
     if totalN == 0:
         return -1.0
     sumPSq = sum((wf / totalN) ** 2 for wf in weightedFreqs.values())
     return round(1.0 - sumPSq, 3)
 
 
 COHORT_ORDER = ["AFR", "AMR", "EAS", "EUR", "SAS"]
 COHORT_MAP = {
     "1000 Genomes AFR": "AFR",
     "1000 Genomes AMR": "AMR",
     "1000 Genomes EAS": "EAS",
     "1000 Genomes EUR": "EUR",
     "1000 Genomes SAS": "SAS",
 }
 
 def loadAlleleFreqs(inDir):
     """Load allele frequency data, grouped by repeatid and cohort."""
     freqs = defaultdict(lambda: {c: {"alleles": [], "freqs": [], "n": 0} for c in COHORT_ORDER})
     path = f"{inDir}/hg38_afreqs.csv.gz"
     with gzip.open(path, "rt") as f:
         reader = csv.reader(f)
         header = next(reader)  # skip header
         for row in reader:
             cohort_raw, allele, freq, n, repeatid = row
             cohort = COHORT_MAP.get(cohort_raw)
             if cohort is None:
                 continue
             entry = freqs[repeatid][cohort]
             entry["alleles"].append(allele)
             entry["freqs"].append(freq)
             entry["n"] = int(n)
     return freqs
 
 def main():
     if len(sys.argv) != 2:
         print(__doc__, file=sys.stderr)
         sys.exit(1)
 
     inDir = sys.argv[1]
 
     print("Loading allele frequencies...", file=sys.stderr)
     afreqs = loadAlleleFreqs(inDir)
     print(f"  Loaded frequencies for {len(afreqs)} repeats", file=sys.stderr)
 
     print("Processing repeats...", file=sys.stderr)
     repeatsPath = f"{inDir}/hg38_repeats_withlinks.csv.gz"
     count = 0
     with gzip.open(repeatsPath, "rt") as f:
         reader = csv.reader(f)
         header = next(reader)  # skip header
         for row in reader:
             repeatid, panel, chrom, motif, start, end, period, numcopies, _webstr_link = row
             # Source coordinates are 1-based; convert start to 0-based for BED
             start = str(int(start) - 1)
 
             # Compute heterozygosity pooled across populations
             af = afreqs.get(repeatid)
             het = computeHet(af)
             color = hetToColor(het)
             score = max(0, int(het * 1000)) if het >= 0 else 0
 
             # BED9 fields
             fields = [
                 chrom,
                 start,
                 end,
                 truncateMotif(motif) + "x" + numcopies,  # name
                 str(score),     # score (het * 1000)
                 ".",            # strand
                 start,          # thickStart
                 end,            # thickEnd
                 color,          # itemRgb
                 motif,
                 period,
                 numcopies,
                 repeatid,
                 str(het),       # het field
             ]
             for cohort in COHORT_ORDER:
                 if af and af[cohort]["alleles"]:
                     entry = af[cohort]
                     pairs = [a + "=" + f for a, f in zip(entry["alleles"], entry["freqs"])]
                     fields.append(" ".join(pairs))
                     fields.append(str(entry["n"]))
                 else:
                     fields.append("")
                     fields.append("0")
 
             print("\t".join(fields))
             count += 1
             if count % 500000 == 0:
                 print(f"  Processed {count} repeats...", file=sys.stderr)
 
     print(f"Done. Wrote {count} records.", file=sys.stderr)
 
 if __name__ == "__main__":
     main()