#!/usr/bin/env python3
"""Convert WebSTR CSV data to BED9+ format for bigBed conversion.

Reads hg38_repeats_withlinks.csv.gz and hg38_afreqs.csv.gz from the input
directory and writes a tab-separated BED file to stdout.

Usage:
    webstrToBed.py inDir > webstr.bed
"""

import csv
import gzip
import sys
from collections import defaultdict

# itemRgb value per repeat period (motif length in bp)
PERIOD_COLORS = {
    1: "255,0,0",     # mono: red
    2: "0,0,255",     # di: blue
    3: "0,128,0",     # tri: green
    4: "255,165,0",   # tetra: orange
    5: "128,0,128",   # penta: purple
    6: "70,130,180",  # hexa: steel blue
}
DEFAULT_COLOR = "128,128,128"  # gray for period > 6

# Fixed output order of the per-cohort allele-frequency column triples
COHORT_ORDER = ["AFR", "AMR", "EAS", "EUR", "SAS"]
# Maps the cohort labels as they appear in hg38_afreqs.csv.gz to short codes
COHORT_MAP = {
    "1000 Genomes AFR": "AFR",
    "1000 Genomes AMR": "AMR",
    "1000 Genomes EAS": "EAS",
    "1000 Genomes EUR": "EUR",
    "1000 Genomes SAS": "SAS",
}

def loadAlleleFreqs(inDir):
    """Load allele frequency data, grouped by repeatid and cohort.

    Args:
        inDir: directory containing hg38_afreqs.csv.gz with columns
            (cohort, allele, freq, n, repeatid).

    Returns:
        dict mapping repeatid -> cohort code -> {"alleles": [str, ...],
        "freqs": [str, ...], "n": int}. Alleles and freqs are kept as the
        original strings so they can be comma-joined verbatim into the BED.
        Rows whose cohort is not one of the five 1000 Genomes panels are
        skipped.
    """
    freqs = defaultdict(lambda: {c: {"alleles": [], "freqs": [], "n": 0} for c in COHORT_ORDER})
    path = f"{inDir}/hg38_afreqs.csv.gz"
    with gzip.open(path, "rt") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header; tolerate an empty file
        for row in reader:
            cohort_raw, allele, freq, n, repeatid = row
            cohort = COHORT_MAP.get(cohort_raw)
            if cohort is None:
                continue  # not a 1000 Genomes cohort this track reports
            entry = freqs[repeatid][cohort]
            entry["alleles"].append(allele)
            entry["freqs"].append(freq)
            # n is the cohort's sample count; assumed identical on every
            # row of a (repeatid, cohort) group, so last-one-wins is fine
            entry["n"] = int(n)
    return freqs

def main():
    """Read the two WebSTR CSVs from argv[1] and write BED9+ to stdout.

    Exits with status 1 (printing the module usage) unless exactly one
    argument, the input directory, is given.
    """
    if len(sys.argv) != 2:
        print(__doc__, file=sys.stderr)
        sys.exit(1)

    inDir = sys.argv[1]

    print("Loading allele frequencies...", file=sys.stderr)
    afreqs = loadAlleleFreqs(inDir)
    print(f"  Loaded frequencies for {len(afreqs)} repeats", file=sys.stderr)

    print("Processing repeats...", file=sys.stderr)
    repeatsPath = f"{inDir}/hg38_repeats_withlinks.csv.gz"
    count = 0
    with gzip.open(repeatsPath, "rt") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip header; tolerate an empty file
        for row in reader:
            repeatid, panel, chrom, motif, start, end, period, numcopies, _webstr_link = row
            period_int = int(period)
            color = PERIOD_COLORS.get(period_int, DEFAULT_COLOR)

            # Source coordinates are 1-based; convert start to 0-based for BED
            start = str(int(start) - 1)

            # BED9 fields
            fields = [
                chrom,
                start,
                end,
                repeatid,  # name
                "0",       # score
                ".",       # strand
                start,     # thickStart
                end,       # thickEnd
                color,     # itemRgb
                motif,
                period,
                numcopies,
            ]

            # Allele frequency fields (alleles, freqs, n) for each cohort,
            # in COHORT_ORDER; empty/zero placeholders keep column count fixed
            af = afreqs.get(repeatid)
            for cohort in COHORT_ORDER:
                if af and af[cohort]["alleles"]:
                    entry = af[cohort]
                    fields.append(",".join(entry["alleles"]))
                    fields.append(",".join(entry["freqs"]))
                    fields.append(str(entry["n"]))
                else:
                    fields.append("")
                    fields.append("")
                    fields.append("0")

            print("\t".join(fields))
            count += 1
            if count % 500000 == 0:
                print(f"  Processed {count} repeats...", file=sys.stderr)

    print(f"Done. Wrote {count} records.", file=sys.stderr)

if __name__ == "__main__":
    main()