a37d4ffb473b6ebf6c66545f2b7d5f7eef35ef4b max Fri Apr 10 03:00:33 2026 -0700 Color strVar subtracks by expected heterozygosity instead of motif period, fix hgTrackUi filter label truncation, refs #36652 Change all four strVar subtracks (webstr, tommoStr, trexplorer, viennaVntr) from motif-period-based coloring to expected heterozygosity (het = 1 - sum(p_i^2)), using a blue-to-red heat map: dark blue (het<0.1) through medium blue, light purple, salmon, to dark red (het>=0.7). Add het as a filterable bigBed field with scoreFilter and filterByRange on each track. Update mouseOver, track docs, and makedoc. Also fix hgTrackUi to strip the "|..." suffix from autoSql comments when displaying numeric filter labels. Co-Authored-By: Claude Opus 4.6 (1M context) diff --git src/hg/makeDb/scripts/viennaVntr/viennaVntrToBed.py src/hg/makeDb/scripts/viennaVntr/viennaVntrToBed.py new file mode 100644 index 00000000000..5a859f5c52d --- /dev/null +++ src/hg/makeDb/scripts/viennaVntr/viennaVntrToBed.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +""" +Convert Vienna ONT 1000 Genomes VAMOS VNTR summary TSV to BED9+ for bigBed. + +Input: vamos-summary.tsv from the Vienna ONT project. +Output: BED to stdout. + +Usage: + python3 viennaVntrToBed.py vamos-summary.tsv | sort -k1,1 -k2,2n > viennaVntr.bed +""" + +import sys + +HET_BINS = [ + (0.1, "0,0,180"), # het < 0.1: dark blue (nearly monomorphic) + (0.3, "70,130,230"), # het 0.1-0.3: medium blue (low diversity) + (0.5, "180,130,200"), # het 0.3-0.5: light purple (moderate diversity) + (0.7, "230,100,80"), # het 0.5-0.7: salmon (high diversity) + (999, "180,0,0"), # het >= 0.7: dark red (very high diversity) +] +NO_DATA_COLOR = "128,128,128" # gray when no het data + + +def het_to_color(het): + """Map heterozygosity value to an RGB color string.""" + if het < 0: + return NO_DATA_COLOR + for threshold, color in HET_BINS: + if het < threshold: + return color + return HET_BINS[-1][1] + + +def load_het(het_path): + """Load het values from TSV (chrom, pos_1based, het).""" + het_map = {} + with open(het_path) as f: + for line in f: + chrom, pos, het = line.rstrip("\n").split("\t") + het_map[(chrom, int(pos))] = float(het) + return het_map + + +def main(): + if len(sys.argv) < 3: + print("Usage: viennaVntrToBed.py vamos-summary.tsv het.tsv", file=sys.stderr) + sys.exit(1) + + print("Loading het values...", file=sys.stderr) + het_map = load_het(sys.argv[2]) + print(f" Loaded {len(het_map)} het values", file=sys.stderr) + + count = 0 + with open(sys.argv[1]) as f: + header = None + for line in f: + if line.startswith("#"): + header = line.strip().lstrip("#").split("\t") + continue + + parts = line.strip().split("\t") + if len(parts) < 20: + continue + + def safe_int(s, default=0): + try: + v = float(s) + if v != v: # NaN check + return default + return int(v) + except (ValueError, OverflowError): + return default + + def safe_float(s, default="0"): + try: + v = float(s) + if v != v: # NaN + return default + return s + except ValueError: + return default + + chrom = parts[0] + start = int(parts[1]) - 1 # 1-based to 0-based + end = int(parts[2]) + num_haps = safe_int(parts[3]) + ru_len_avg = float(parts[4]) if parts[4] != "NaN" else 0.0 + num_unique = safe_int(parts[6]) + max_rus = safe_int(parts[7]) + min_rus = safe_int(parts[8]) + median_rus = safe_float(parts[9]) + pct25_rus = safe_float(parts[12]) + pct75_rus = safe_float(parts[13]) + unique_num_rus = safe_int(parts[16]) + max_bps = safe_int(parts[17]) + min_bps = safe_int(parts[18]) + median_bps = safe_float(parts[19]) + + # Lookup het (key is chrom, 1-based pos from VCF) + het = het_map.get((chrom, start + 1), -1.0) + color = het_to_color(het) + score = max(0, int(het * 1000)) if het >= 0 else 0 + + # Name: motif size and median repeat count + name = f"{int(round(ru_len_avg))}bp x{median_rus}" + + fields = [ + chrom, str(start), str(end), name, str(score), ".", + str(start), str(end), color, + str(num_haps), + f"{ru_len_avg:.1f}", + str(num_unique), + str(max_rus), str(min_rus), median_rus, + pct25_rus, pct75_rus, + str(unique_num_rus), + str(max_bps), str(min_bps), median_bps, + str(het), + ] + print("\t".join(fields)) + count += 1 + + print(f"Wrote {count:,} loci", file=sys.stderr) + + +if __name__ == "__main__": + main()