f24ea956ba3b7a8c7ee8be0826d66489b85e0bbc lrnassar Tue Apr 28 18:22:48 2026 -0700 Adding TP53 VCEP track hub build scripts. refs #37399 16 Python scripts that build the 15 bigBed tracks for the ClinGen TP53 VCEP track hub (CSpec GN009 v2.4.0, NM_000546.6 / NP_000537.3). Includes the NON-FINAL Provisional Classification (Tavtigian point sum across PM1 + PS3/BS3 + PP3/BP4 + AF + BS2 + splicing PP3), gnomAD v4.1 AF codes, FLOSSIES BS2 evidence, Bioinformatic PP3/BP4 (missense + single-aa in-frame del), VCEP curated variants from EvRepo with ClinVar VCV backfill, PM1 (clinical domains + cancerhotspots), PVS1 regions and splice sites, and the Functional Evidence composite (VCEP preliminary PS3/BS3 with per-paper raw scores from Kato/Giacomelli/Kawaguchi/Funk). diff --git src/hg/makeDb/scripts/tp53/tp53VCEPClinVar.py src/hg/makeDb/scripts/tp53/tp53VCEPClinVar.py new file mode 100644 index 00000000000..bd966993935 --- /dev/null +++ src/hg/makeDb/scripts/tp53/tp53VCEPClinVar.py @@ -0,0 +1,480 @@ +#!/usr/bin/env python3 +""" +TP53 VCEP Curated Variants track generator. + +Fetches TP53 VCEP classifications from the ClinGen Evidence Repository +(EvRepo). EvRepo is the authoritative source for VCEP classifications +(176 TP53 variants as of 2026-04-17) and is used here instead of ClinVar +esearch because ClinVar's NCBI esearch index silently drops legacy VCV +records like VCV 12374 / R175H. + +Emits bigBed 9+9 with the per-variant list of applied evidence codes plus +the overall classification (P / LP / VUS / LB / B). hg38 and hg19 +coordinates parsed directly from the HGVS list; liftOver fallback where +absent. +""" + +import argparse +import json +import os +import re +import subprocess +import sys +import tempfile +import time +import urllib.parse +import urllib.request + +EVREPO_URL = ("https://erepo.genome.network/evrepo/api/classifications" + "?gene=TP53&matchLimit=2000&format=json") +CLINVAR_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" + +# HGVS genomic regex for hg38 (NC_000017.11) and hg19 (NC_000017.10) +HGVS_G_RE = re.compile(r'^NC_0000(17)\.(\d+):g\.(\d+)([ACGT])>([ACGT])$') +HGVS_G_INDEL_RE = re.compile(r'^NC_0000(17)\.(\d+):g\.(\d+)_?(\d+)?(del|ins|dup).*$') + +LIFTOVER_CHAINS = { + 'hg19_to_hg38': '/cluster/data/hg19/bed/liftOver/hg19ToHg38.over.chain.gz', + 'hg38_to_hg19': '/cluster/data/hg38/bed/liftOver/hg38ToHg19.over.chain.gz', +} +CHROM_SIZES = { + 'hg19': '/cluster/data/hg19/chrom.sizes', + 'hg38': '/cluster/data/hg38/chrom.sizes', +} + +COLORS = { + 'pathogenic': '210,0,0', + 'likely pathogenic': '245,152,152', + 'uncertain significance': '0,0,136', + 'likely benign': '213,247,213', + 'benign': '0,210,0', +} +DEFAULT_COLOR = '136,136,136' + +CLASSIFICATION_CANONICAL = { + 'pathogenic': 'Pathogenic', + 'likely pathogenic': 'Likely pathogenic', + 'uncertain significance': 'Uncertain significance', + 'likely benign': 'Likely benign', + 'benign': 'Benign', +} + +AUTOSQL = """table TP53VCEPCuratedVars +"TP53 VCEP expert-panel-reviewed classifications from ClinGen EvRepo" + ( + string chrom; "Reference sequence chromosome or scaffold" + uint chromStart; "Start position in chromosome" + uint chromEnd; "End position in chromosome" + string name; "HGVS (NM_000546.6 preferred)" + uint score; "Not used, set to 0" + char[1] strand; "Not used, set to ." + uint thickStart; "Same as chromStart" + uint thickEnd; "Same as chromEnd" + uint reserved; "RGB color" + string classification; "Final VCEP classification (P/LP/VUS/LB/B)" + string hgvsc; "c. notation (NM_000546.6)" + string hgvsp; "p. notation if available" + string varId; "ClinVar variation ID" + string caid; "ClinGen Canonical Allele ID" + string publishedDate; "Date published to EvRepo" + string metCodes; "Semicolon-separated list of Met evidence codes" + string notMetCodes; "Count / summary of Not Met codes" + lstring _mouseOver; "HTML mouseover" + ) +""" + + +def log(msg): + print(msg, file=sys.stderr) + + +def bash(cmd): + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError("Command failed: {}\n{}".format(cmd, result.stderr)) + return result.stdout + + +def escape_html(s): + if not s: + return "" + # First escape HTML special chars, then encode any remaining non-ASCII + # as numeric entities so UCSC hgTracks' mouseover pipeline renders + # cleanly (raw UTF-8 ends up as mojibake like 'â€"'). + escaped = (str(s).replace('&', '&').replace('<', '<') + .replace('>', '>').replace('"', '"')) + return "".join( + ch if ord(ch) < 128 else "&#{};".format(ord(ch)) + for ch in escaped + ) + + +def fetch_evrepo(): + """Download the full TP53 VCEP classification list from EvRepo.""" + log("Fetching TP53 VCEP classifications from EvRepo...") + req = urllib.request.Request(EVREPO_URL, + headers={'User-Agent': 'UCSC-kent/TP53-hub'}) + with urllib.request.urlopen(req, timeout=60) as resp: + data = json.loads(resp.read()) + entries = data.get('variantInterpretations', []) + log(" Found {} TP53 VCEP entries".format(len(entries))) + return entries + + +def pick_hgvs(hgvs_list): + """Select preferred HGVS notations from the HGVS list. + + Returns dict with hg38_g, hg19_g (genomic NC format), hgvsc (NM_000546.6), + hgvsp (NM_000546.6 with p.), display name. + """ + out = {'hg38_g': None, 'hg19_g': None, + 'hgvsc': None, 'hgvsp_full': None, 'display': None} + for h in hgvs_list: + if h.startswith('NC_000017.11:g.'): + out['hg38_g'] = h + elif h.startswith('NC_000017.10:g.'): + out['hg19_g'] = h + elif h.startswith('NM_000546.6:c.'): + out['hgvsc'] = h + elif h.startswith('NM_000546.5(TP53):c.') or h.startswith('NM_000546.6(TP53):c.'): + # The prose form includes p. — e.g. + # "NM_000546.5(TP53):c.245C>T (p.Pro82Leu)" + out['hgvsp_full'] = h + if out['display'] is None: + out['display'] = h + if out['display'] is None: + out['display'] = out['hgvsc'] or (hgvs_list[0] if hgvs_list else 'unknown') + return out + + +def hgvs_to_bed_coords(hgvs_g): + """Convert NC_000017:g.POS[ACGT]>[ACGT] or indel-style g. to (start, end).""" + if not hgvs_g: + return None + m = HGVS_G_RE.match(hgvs_g) + if m: + pos = int(m.group(3)) + return (pos - 1, pos) + m = HGVS_G_INDEL_RE.match(hgvs_g) + if m: + start = int(m.group(3)) + end_s = m.group(4) + end = int(end_s) if end_s else start + kind = m.group(5) + # For a dup, the coord is the duplicated region. For del, the deleted range. + # For ins, zero-width between positions; use start-1 to start in BED. + if kind == 'ins': + return (start, start) + return (start - 1, end) + return None + + +def parse_entry(entry): + """Turn one EvRepo interpretation into our internal dict.""" + hgvs_list = entry.get('hgvs', []) or [] + picks = pick_hgvs(hgvs_list) + + hg38 = hgvs_to_bed_coords(picks['hg38_g']) + hg19 = hgvs_to_bed_coords(picks['hg19_g']) + + outcome = None + met = [] + not_met_count = 0 + for g in entry.get('guidelines', []) or []: + if g.get('outcome'): + outcome = g['outcome'].get('label') + for a in g.get('agents', []) or []: + for c in a.get('evidenceCodes', []) or []: + label = c.get('label') or '' + status = (c.get('status') or '').strip().lower() + if status == 'met': + met.append(label) + elif status == 'not met': + not_met_count += 1 + + cls_raw = outcome or '' + cls = CLASSIFICATION_CANONICAL.get(cls_raw.strip().lower(), cls_raw) + + # Pull a short p. from the prose form, if present. + hgvsp = '' + if picks['hgvsp_full']: + m = re.search(r'\(p\.[^)]+\)', picks['hgvsp_full']) + if m: + hgvsp = m.group(0)[1:-1] + + hgvsc = '' + if picks['hgvsc']: + hgvsc = picks['hgvsc'] + elif picks['hgvsp_full']: + # Fallback to c. from the prose form + m = re.search(r'c\.[^\s(]+', picks['hgvsp_full']) + if m: + hgvsc = 'NM_000546.6:' + m.group(0) + + return { + 'var_id': entry.get('variationId', ''), + 'caid': entry.get('caid', ''), + 'published': entry.get('publishedDate', ''), + 'classification': cls, + 'hgvsc': hgvsc, + 'hgvsp': hgvsp, + 'display': picks['display'], + 'hg38_bed': hg38, # (start,end) or None + 'hg19_bed': hg19, + 'met_codes': met, + 'not_met_count': not_met_count, + } + + +def liftover_coords(coords, chain): + if not coords: + return {} + with tempfile.NamedTemporaryFile(mode='w', suffix='.bed', delete=False) as f: + in_bed = f.name + for vid, (chrom, s, e) in coords.items(): + f.write("{}\t{}\t{}\t{}\n".format(chrom, s, e, vid)) + out_bed = in_bed.replace('.bed', '.lifted.bed') + un_bed = in_bed.replace('.bed', '.unmapped.bed') + try: + bash("liftOver {} {} {} {} 2>/dev/null".format(in_bed, chain, out_bed, un_bed)) + except Exception: + pass + lifted = {} + if os.path.exists(out_bed): + with open(out_bed) as f: + for line in f: + flds = line.strip().split('\t') + if len(flds) >= 4: + lifted[flds[3]] = (flds[0], int(flds[1]), int(flds[2])) + for p in [in_bed, out_bed, un_bed]: + if os.path.exists(p): + os.remove(p) + return lifted + + +def create_bed(variants, assembly): + lines = [] + unmapped = [] + # Which coord set is native on this assembly? + native_key = 'hg38_bed' if assembly == 'hg38' else 'hg19_bed' + other_key = 'hg19_bed' if assembly == 'hg38' else 'hg38_bed' + chain = LIFTOVER_CHAINS['hg19_to_hg38'] if assembly == 'hg38' \ + else LIFTOVER_CHAINS['hg38_to_hg19'] + + # Build needs_liftover map for variants missing native coords. + needs = {} + for v in variants: + if v[native_key] is None and v[other_key] is not None: + other_start, other_end = v[other_key] + needs[v['var_id']] = ('chr17', other_start, other_end) + lifted = liftover_coords(needs, chain) if needs else {} + if needs: + log(" liftOver ({} -> {}): {} lifted of {} needing".format( + 'hg19' if assembly == 'hg38' else 'hg38', + assembly, len(lifted), len(needs))) + + for v in variants: + start, end = None, None + if v[native_key] is not None: + start, end = v[native_key] + chrom = 'chr17' + elif v['var_id'] in lifted: + chrom, start, end = lifted[v['var_id']] + else: + unmapped.append(v) + continue + + color = COLORS.get((v['classification'] or '').strip().lower(), DEFAULT_COLOR) + cv_url = "https://www.ncbi.nlm.nih.gov/clinvar/variation/{}/".format(v['var_id']) + met_summary = ", ".join(v['met_codes']) if v['met_codes'] else "(none Met)" + mo = ( + "Final — VCEP ClinVar submission
" + "Variant: {disp}
" + "Classification: {cls}
" + "ClinGen CAid: {caid}
" + "ClinVar ID: {vid}
" + "Published: {pub}
" + "Met evidence codes: {met}
" + "Not Met codes: {notmet}
" + "Source: ClinGen Evidence Repository (EvRepo)" + ).format( + disp=escape_html(v['display']), + cls=escape_html(v['classification']), + caid=escape_html(v['caid']), + url=cv_url, vid=v['var_id'], + pub=escape_html(v['published']), + met=escape_html(met_summary), + notmet=v['not_met_count'], + ) + # Use HGVSp short if available, else c. notation, else full display. + # ASCII-encode every free-text field to avoid UCSC mouseover mojibake. + def _safe(s): + if s is None: + return '' + return "".join( + ch if ord(ch) < 128 else "&#{};".format(ord(ch)) for ch in str(s) + ) + name = v['hgvsp'] or v['hgvsc'] or v['display'] + if len(name) > 200: + name = name[:197] + "..." + lines.append("\t".join([ + chrom, str(start), str(end), + _safe(name), '0', '.', + str(start), str(end), + color, + v['classification'] or 'Unknown', + _safe(v['hgvsc'] or ''), + _safe(v['hgvsp'] or ''), + v['var_id'], + _safe(v['caid']), + v['published'], + _safe("; ".join(v['met_codes'])), + str(v['not_met_count']), + mo, + ])) + return lines, unmapped + + +def build_assembly(variants, assembly, outdir): + log("\n=== {} ===".format(assembly)) + entries, unmapped = create_bed(variants, assembly) + log(" mapped: {} unmapped: {}".format(len(entries), len(unmapped))) + if not entries: + return None + bed = os.path.join(outdir, "TP53VCEPCuratedVars_{}.bed".format(assembly)) + with open(bed, 'w') as f: + f.write("\n".join(entries) + "\n") + bash("sort -k1,1 -k2,2n {0} -o {0}".format(bed)) + as_file = os.path.join(outdir, "TP53VCEPCuratedVars.as") + bb = os.path.join(outdir, "TP53VCEPCuratedVars{}.bb".format(assembly.capitalize())) + bash("bedToBigBed -as={} -type=bed9+9 -tab {} {} {}".format( + as_file, bed, CHROM_SIZES[assembly], bb)) + log(" wrote {}".format(bb)) + return bb + + +def write_tsv(variants, path): + cols = ['var_id', 'caid', 'display', 'hgvsc', 'hgvsp', 'classification', + 'published', 'met_codes', 'not_met_count', + 'hg38_start', 'hg38_end', 'hg19_start', 'hg19_end'] + with open(path, 'w') as f: + f.write("\t".join(cols) + "\n") + for v in variants: + h38 = v['hg38_bed'] or (None, None) + h19 = v['hg19_bed'] or (None, None) + row = [ + v['var_id'], v['caid'], v['display'], v['hgvsc'] or '', + v['hgvsp'] or '', v['classification'] or '', v['published'], + ';'.join(v['met_codes']), str(v['not_met_count']), + str(h38[0] if h38[0] is not None else ''), + str(h38[1] if h38[1] is not None else ''), + str(h19[0] if h19[0] is not None else ''), + str(h19[1] if h19[1] is not None else ''), + ] + f.write("\t".join(row) + "\n") + + +def backfill_clinvar_for_unlinked(variants): + """For records that came in without a ClinVar var_id, look them up by + HGVSc and backfill var_id + hgvsp from ClinVar esummary. Some EvRepo + records (e.g. complex indels like c.322_332delinsATTCA) come through + the variantInterpretations endpoint with hgvsp empty and no variationId + populated; we recover both via ClinVar's esearch by VarName + esummary. + """ + eutils_search = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" + eutils_summary = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi" + fixed = 0 + # Pace requests to stay well under NCBI's 3 req/sec limit and avoid 429s + # on first-hit cold cache. + time.sleep(1.0) + for v in variants: + if v['var_id'] or not v['hgvsc']: + continue + m = re.match(r'NM_000546(?:\.\d+)?:(c\.[^\s(]+)', v['hgvsc']) + if not m: + continue + c_form = m.group(1) + success = False + for attempt in range(3): + try: + url = (eutils_search + + "?db=clinvar&retmode=json&term=" + + urllib.parse.quote('"{}"[VarName] AND tp53[gene]'.format(c_form))) + with urllib.request.urlopen(url, timeout=20) as resp: + hits = json.loads(resp.read())['esearchresult']['idlist'] + if not hits: + break + vid = hits[0] + time.sleep(0.5) + url = "{}?db=clinvar&id={}&retmode=json".format(eutils_summary, vid) + with urllib.request.urlopen(url, timeout=20) as resp: + summ = json.loads(resp.read())['result'][vid] + v['var_id'] = vid + title = summ.get('title', '') + mp = re.search(r'\(p\.[^)]+\)', title) + if mp and not v['hgvsp']: + v['hgvsp'] = mp.group(0)[1:-1] + fixed += 1 + success = True + break + except urllib.error.HTTPError as e: + if e.code == 429 and attempt < 2: + time.sleep(2.0 * (attempt + 1)) + continue + log(" backfill skipped for {}: {}".format(c_form, e)) + break + except Exception as e: + log(" backfill skipped for {}: {}".format(c_form, e)) + break + if success: + time.sleep(0.5) + if fixed: + log("Backfilled ClinVar var_id for {} previously-unlinked records".format(fixed)) + + +def fetch_all_variants(): + entries = fetch_evrepo() + variants = [parse_entry(e) for e in entries] + variants = [v for v in variants if v['classification']] + log("Parsed {} variants with classifications".format(len(variants))) + backfill_clinvar_for_unlinked(variants) + return variants + + +def main(): + p = argparse.ArgumentParser(description=__doc__) + p.add_argument('-o', '--output-dir', + default=os.path.dirname(os.path.abspath(__file__)), + help='Output directory (default: script directory)') + p.add_argument('--db', action='append', + help='hg38 or hg19 (repeat); default both') + args = p.parse_args() + outdir = args.output_dir + os.makedirs(outdir, exist_ok=True) + dbs = args.db if args.db else ['hg38', 'hg19'] + + variants = fetch_all_variants() + if not variants: + log("ERROR: no variants parsed from EvRepo") + sys.exit(1) + + tsv = os.path.join(outdir, "tp53_vcep_clinvar_variants.tsv") + write_tsv(variants, tsv) + log("TSV: {}".format(tsv)) + + as_file = os.path.join(outdir, "TP53VCEPCuratedVars.as") + with open(as_file, 'w') as f: + f.write(AUTOSQL) + + for db in dbs: + build_assembly(variants, db, outdir) + + from collections import Counter + cls_counts = Counter(v['classification'] for v in variants) + log("\nClassification counts:") + for c, n in cls_counts.most_common(): + log(" {}: {}".format(c, n)) + + +if __name__ == "__main__": + main()