8c1ebdae695b1bb40e45f41aff4a4f9fb9e52491 max Thu May 28 05:43:13 2026 -0700 [Claude] add EVE missense variant effect heatmap track for hg38 #Preview2 week - bugs introduced now will need a build patch to fix Heatmap bigBed track showing EVE scores for all possible missense substitutions in 2,949 disease-associated proteins. One entry per protein; columns = amino acid positions at genomic codon coordinates, rows = 20 standard amino acids. Colors blue (benign) to red (pathogenic). Includes conversion script, autoSql, trackDb, HTML doc, and makedoc. Added to predictionScoresSuper. refs #31804 Co-Authored-By: Claude Sonnet 4.6 diff --git src/hg/makeDb/scripts/eve/vcfToEveHeatmap.py src/hg/makeDb/scripts/eve/vcfToEveHeatmap.py new file mode 100644 index 00000000000..a1422a78ef3 --- /dev/null +++ src/hg/makeDb/scripts/eve/vcfToEveHeatmap.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 +"""Convert EVE VCF files to heatmap bigBed format (one entry per protein). + +Usage: vcfToEveHeatmap.py + +Each *_HUMAN.vcf in vcf_dir produces one heatmap BED12+ row. + Columns = protein positions sorted by ascending genomic coordinate. + Rows = 20 standard amino acids (A C D E F G H I K L M N P Q R S T V W Y). + Wildtype cell at each column is left empty. + Colors = blue (EVE=0, benign) -> white (EVE=0.5, uncertain) -> red (EVE=1, pathogenic). + Mouseover includes wildtype, variant, EVE score, and Class25 classification. +""" + +import sys +import os +import re +import glob + +STANDARD_AAS = list('ACDEFGHIKLMNPQRSTVWY') # 20 standard AAs, alphabetical +COLOR_BOUNDS = "0,0.5,1" +COLOR_VALUES = "#2166ac,#f7f7f7,#d6604d" +LEGEND = "EVE score: 0=benign (blue) 0.5=uncertain (white) 1=pathogenic (red)" + +RE_PROT_MUT = re.compile(r'^([A-Z0-9]+)_([A-Z])(\d+)([A-Z])$') + + +def add_chr_prefix(chrom): + """Convert bare chromosome names to UCSC style (1 -> chr1, MT -> chrM).""" + if chrom.startswith('chr'): + return chrom + if chrom == 'MT': + return 'chrM' + return 'chr' + chrom + + +def parse_info(info_str): + """Parse VCF INFO field into a dict.""" + d = {} + for field in info_str.split(';'): + if '=' in field: + k, v = field.split('=', 1) + d[k] = v + return d + + +def process_vcf(vcf_path): + """Parse one VCF file. + + Returns (protein_name, uniprot_id, strand, data) or None on error. + + data: {prot_pos (int): { + 'chrom': str, + 'pos': int (1-based), + 'wt_aa': str, + 'scores': {var_aa: float}, # EVE score, rounded to 3 dp + 'class25': {var_aa: str}, # Benign/Uncertain/Pathogenic + }} + """ + basename = os.path.basename(vcf_path) + protein_name = basename.replace('_HUMAN.vcf', '') + + data = {} + seen = set() # (prot_pos, var_aa) pairs already stored + uniprot = None + strand = None + + with open(vcf_path, encoding='utf-8', errors='replace') as fh: + for line in fh: + if line.startswith('#'): + continue + cols = line.rstrip('\n').split('\t') + if len(cols) < 8: + continue + + chrom_raw, pos_str = cols[0], cols[1] + info = parse_info(cols[7]) + + if 'ProtMut' not in info or 'EVE' not in info: + continue + + m = RE_PROT_MUT.match(info['ProtMut']) + if not m: + sys.stderr.write( + f' WARNING: unparseable ProtMut {info["ProtMut"]!r} in {basename}\n' + ) + continue + + uid, wt_aa, pos_digits, var_aa = m.group(1), m.group(2), m.group(3), m.group(4) + prot_pos = int(pos_digits) + + key = (prot_pos, var_aa) + if key in seen: + continue + seen.add(key) + + if uniprot is None: + uniprot = uid + if strand is None: + strand = '-' if info.get('RevStr', 'False') == 'True' else '+' + + chrom = add_chr_prefix(chrom_raw) + pos = int(pos_str) + eve = round(float(info['EVE']), 3) + cls25 = info.get('Class25', '') + + if prot_pos not in data: + data[prot_pos] = { + 'chrom': chrom, + 'pos': pos, + 'wt_aa': wt_aa, + 'scores': {}, + 'class25': {}, + } + + data[prot_pos]['scores'][var_aa] = eve + data[prot_pos]['class25'][var_aa] = cls25 + + if not data: + sys.stderr.write(f' WARNING: no usable data in {basename}\n') + return None + + return protein_name, uniprot or 'unknown', strand or '+', data + + +def build_entry(protein_name, uniprot_id, strand, data, out): + """Write one heatmap BED12+ line for a protein to file object out.""" + # Sort positions by ascending genomic coordinate (left to right on genome). + # For reverse-strand genes this runs C-term to N-term, matching genomic orientation. + sorted_prot_pos = sorted(data.keys(), key=lambda p: data[p]['pos']) + + # Verify single chromosome (should always be true for one protein). + chroms = {data[p]['chrom'] for p in sorted_prot_pos} + if len(chroms) > 1: + sys.stderr.write( + f' WARNING: {protein_name} spans multiple chromosomes {chroms} - skipping\n' + ) + return + + chrom = next(iter(chroms)) + + # 0-based codon starts (VCF POS is 1-based). + starts0 = [data[p]['pos'] - 1 for p in sorted_prot_pos] + chrom_start = starts0[0] + chrom_end = starts0[-1] + 3 # include last codon (3 bp) + n_cols = len(sorted_prot_pos) + + # BED score = max EVE * 1000, capped at 1000. + max_eve = max( + v for p in sorted_prot_pos for v in data[p]['scores'].values() + ) + bed_score = min(1000, int(max_eve * 1000)) + + block_sizes = ','.join(['3'] * n_cols) + ',' + chrom_starts = ','.join(str(s - chrom_start) for s in starts0) + ',' + + row_count = len(STANDARD_AAS) + labels = ','.join(STANDARD_AAS) + + # Build score and label arrays in row-major order + # (all columns for row 0, then row 1, ...). + score_parts = [] + label_parts = [] + + for aa in STANDARD_AAS: + for prot_pos in sorted_prot_pos: + wt = data[prot_pos]['wt_aa'] + if aa == wt: + # Wildtype cell: leave empty so it renders as background. + score_parts.append('') + label_parts.append('') + elif aa in data[prot_pos]['scores']: + eve = data[prot_pos]['scores'][aa] + cls = data[prot_pos]['class25'].get(aa, '') + score_parts.append(f'{eve:.3f}') + # Mouseover label rendered as HTML; no commas in text. + lbl = f'{wt}{prot_pos}→{aa}: EVE {eve:.3f} ({cls})' + label_parts.append(f'"{lbl}"') + else: + score_parts.append('') + label_parts.append('') + + score_array = ','.join(score_parts) + label_array = ','.join(label_parts) + + fields = [ + chrom, chrom_start, chrom_end, + protein_name, bed_score, strand, + chrom_start, chrom_end, 0, # thickStart, thickEnd, itemRgb + n_cols, block_sizes, chrom_starts, + row_count, labels, + COLOR_BOUNDS, COLOR_VALUES, + score_array, label_array, + LEGEND, uniprot_id, + ] + + out.write('\t'.join(str(f) for f in fields) + '\n') + + +def main(): + if len(sys.argv) != 3: + sys.exit(f'Usage: {sys.argv[0]} ') + + vcf_dir = sys.argv[1] + output_bed = sys.argv[2] + + vcf_files = sorted(glob.glob(os.path.join(vcf_dir, '*_HUMAN.vcf'))) + if not vcf_files: + sys.exit(f'ERROR: no *_HUMAN.vcf files found in {vcf_dir}') + + sys.stderr.write(f'Processing {len(vcf_files)} VCF files...\n') + + n_ok = 0 + n_skip = 0 + + with open(output_bed, 'w', encoding='utf-8') as out: + for i, vcf_path in enumerate(vcf_files, 1): + if i % 200 == 0: + sys.stderr.write(f' {i}/{len(vcf_files)}...\n') + result = process_vcf(vcf_path) + if result is None: + n_skip += 1 + continue + build_entry(*result, out) + n_ok += 1 + + sys.stderr.write(f'Done: {n_ok} proteins written, {n_skip} skipped.\n') + + +if __name__ == '__main__': + main()