8efbcad3f2816dec50b5671a4445d2e6943f7f91 max Mon Apr 13 07:57:41 2026 -0700 Use light gray for monomorphic strVar loci (het=0), distinct from no-data gray, refs #36652 Add a separate color for loci where heterozygosity is exactly 0 (single allele observed) across all four strVar subtracks: light gray (200,200,200). This distinguishes them from the existing medium gray (128,128,128) used when no allele frequency data is available. Previously het=0 was lumped into the dark blue "nearly monomorphic" bin. Also expand the itemRgb field description in all four .as files to list the full color scheme. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> diff --git src/hg/makeDb/scripts/tommoStr/tommoStrToBed.py src/hg/makeDb/scripts/tommoStr/tommoStrToBed.py index 6408d9a9a35..aeebbf9941e 100644 --- src/hg/makeDb/scripts/tommoStr/tommoStrToBed.py +++ src/hg/makeDb/scripts/tommoStr/tommoStrToBed.py @@ -1,172 +1,175 @@ #!/usr/bin/env python3 """ Convert ToMMo 61KJPN STR Expansion Hunter VCF to BED9+ for bigBed. Input: VCF from ToMMo with <STRn> ALT alleles and AC/AF/MEAN/MEDIAN INFO fields. Output: BED to stdout. Usage: python3 tommoStrToBed.py input.vcf.gz | sort -k1,1 -k2,2n > tommoStr.bed """ import sys import gzip import re HET_BINS = [ (0.1, "0,0,180"), # het < 0.1: dark blue (nearly monomorphic) (0.3, "70,130,230"), # het 0.1-0.3: medium blue (low diversity) (0.5, "180,130,200"), # het 0.3-0.5: light purple (moderate diversity) (0.7, "230,100,80"), # het 0.5-0.7: salmon (high diversity) (999, "180,0,0"), # het >= 0.7: dark red (very high diversity) ] -NO_DATA_COLOR = "128,128,128" # gray when no allele freq data +NO_DATA_COLOR = "128,128,128" # medium gray when no allele freq data +MONOMORPHIC_COLOR = "200,200,200" # light gray when het == 0 (fixed allele) def het_to_color(het): """Map heterozygosity value to an RGB color string.""" if het < 0: return NO_DATA_COLOR + if het == 0: + return MONOMORPHIC_COLOR for threshold, color in HET_BINS: if het < threshold: return color return HET_BINS[-1][1] def compute_het(hist_parts): """Compute expected heterozygosity from allele count pairs. hist_parts: list of (copies, count) tuples. Returns het = 1 - sum(p_i^2), or -1 if no data. """ if not hist_parts: return -1.0 total = sum(c for _, c in hist_parts) if total == 0: return -1.0 sum_p_sq = sum((c / total) ** 2 for _, c in hist_parts) return round(1.0 - sum_p_sq, 3) def truncate_motif(motif, max_len=25): """Truncate long motifs with .. in the middle.""" if len(motif) <= max_len: return motif half = (max_len - 2) // 2 return motif[:half] + ".." + motif[-half:] def parse_info(info_str): """Parse VCF INFO field into a dict.""" d = {} for item in info_str.split(";"): if "=" in item: key, val = item.split("=", 1) d[key] = val else: d[item] = True return d def parse_str_alleles(alt_str): """Parse <STR0>,<STR1>,... into list of repeat copy numbers.""" copies = [] for a in alt_str.split(","): m = re.match(r"<STR(\d+)>", a) if m: copies.append(int(m.group(1))) else: copies.append(None) return copies def main(): if len(sys.argv) < 2: print("Usage: tommoStrToBed.py input.vcf.gz", file=sys.stderr) sys.exit(1) vcf_path = sys.argv[1] opener = gzip.open if vcf_path.endswith(".gz") else open count = 0 with opener(vcf_path, "rt") as f: for line in f: if line.startswith("#"): continue parts = line.rstrip("\n").split("\t") if len(parts) < 8: continue chrom, pos, _, ref_allele, alt, _, _, info_str = parts[:8] info = parse_info(info_str) # Parse coordinates start = int(pos) - 1 # VCF is 1-based end = int(info.get("END", pos)) # Parse repeat info motif = info.get("RU", ".") period = len(motif) if motif != "." else 0 ref_copies = int(info.get("REF", "0")) mean_val = info.get("MEAN", "0") median_val = info.get("MEDIAN", "0") an = info.get("AN", "0") # Parse allele counts alt_copies = parse_str_alleles(alt) ac_vals = info.get("AC", "").split(",") if "AC" in info else [] # Build histogram: copies=count pairs # Include the reference allele count hist_parts = [] total_alt_ac = 0 for i, copies in enumerate(alt_copies): if copies is None or i >= len(ac_vals): continue try: ac = int(ac_vals[i]) except ValueError: continue if ac > 0: hist_parts.append((copies, ac)) total_alt_ac += ac # Add reference allele count (AN - sum of AC) try: an_int = int(an) ref_ac = an_int - total_alt_ac if ref_ac > 0: hist_parts.append((ref_copies, ref_ac)) except ValueError: pass # Sort by copy number hist_parts.sort(key=lambda x: x[0]) hist_str = " ".join(f"{c}={n}" for c, n in hist_parts) if not hist_str: hist_str = "." # Compute heterozygosity and color het = compute_het(hist_parts) color = het_to_color(het) score = max(0, int(het * 1000)) if het >= 0 else 0 # Name motif_display = truncate_motif(motif) name = f"{motif_display} x{ref_copies}" fields = [ chrom, str(start), str(end), name, str(score), ".", str(start), str(end), color, motif, str(period), str(ref_copies), mean_val, median_val, an, str(het), hist_str, ] print("\t".join(fields)) count += 1 print(f"Wrote {count:,} loci", file=sys.stderr) if __name__ == "__main__": main()