f24ea956ba3b7a8c7ee8be0826d66489b85e0bbc lrnassar Tue Apr 28 18:22:48 2026 -0700 Adding TP53 VCEP track hub build scripts. refs #37399 16 Python scripts that build the 15 bigBed tracks for the ClinGen TP53 VCEP track hub (CSpec GN009 v2.4.0, NM_000546.6 / NP_000537.3). Includes the NON-FINAL Provisional Classification (Tavtigian point sum across PM1 + PS3/BS3 + PP3/BP4 + AF + BS2 + splicing PP3), gnomAD v4.1 AF codes, FLOSSIES BS2 evidence, Bioinformatic PP3/BP4 (missense + single-aa in-frame del), VCEP curated variants from EvRepo with ClinVar VCV backfill, PM1 (clinical domains + cancerhotspots), PVS1 regions and splice sites, and the Functional Evidence composite (VCEP preliminary PS3/BS3 with per-paper raw scores from Kato/Giacomelli/Kawaguchi/Funk). diff --git src/hg/makeDb/scripts/tp53/tp53ProvisionalClass.py src/hg/makeDb/scripts/tp53/tp53ProvisionalClass.py new file mode 100644 index 00000000000..a304e51f4bf --- /dev/null +++ src/hg/makeDb/scripts/tp53/tp53ProvisionalClass.py @@ -0,0 +1,593 @@ +#!/usr/bin/env python3 +""" +TP53 VCEP NON-FINAL Provisional Classification track generator. + +For every possible p53 missense protein change, sum the Tavtigian points +from applicable evidence codes the VCEP has pre-computed: + + * PM1 hotspot (Clinical Domains + cancerhotspots.org) + * PS3 / BS3 (CSpec GN009 v2.4.0 Table S3) + * PP3 / BP4 (CSpec GN009 v2.4.0 Table S2) + * BA1 / BS1 / PM2 (gnomAD v4.1 exomes, per CSpec FAF/grpmax thresholds) + * BS2 (FLOSSIES healthy-women-over-70 cohort) + * Splicing PP3 (SpliceAI >= 0.2) + +The point sum is bucketed into P / LP / VUS / LB / B per the CSpec +classification ranges. BA1 is stand-alone Benign and forces class = Benign +regardless of other evidence. + +DELIBERATELY EXCLUDED from the sum (documented in every mouseover): + - PVS1 (null variants only; handled in separate track) + - PS1 / PS2 / PS4 / PP1 / PP4 / BS4 / BP7 (require clinical observations) + +This is NOT a VCEP classification — the warning is in every mouseover +since clinicians live in the mouseover, not the description page. + +bigBed 9+9. +""" + +import argparse +import json +import os +import re +import sys + +import openpyxl + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +import tp53FuncLib as lib + +DEFAULT_OUTDIR = "/hive/users/lrnassar/claude/RM37399/provisionalClass" +SRC_S3 = "/hive/users/lrnassar/claude/RM37399/tp53_downloads/Functional-worksheet.xlsx" +SRC_S2 = "/hive/users/lrnassar/claude/RM37399/tp53_downloads/bioinformatic_worksheet.xlsx" +HOTSPOTS_JSON = "/hive/users/lrnassar/claude/RM37399/cancerHotspots/cancerhotspots_single.json" +AF_BED_TPL = "/hive/users/lrnassar/claude/RM37399/afFrequencies/TP53AF_{}.bed" +FLOSSIES_BED_TPL = "/hive/users/lrnassar/claude/RM37399/flossies/TP53Flossies_{}.bed" + +PM1_HARDCODED_CODONS = {175, 245, 248, 249, 273, 282} + +PROT_RE = re.compile(r'^([A-Z])(\d+)([A-Z])$') +HGVSP3_RE = re.compile(r'^p\.([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})$') +HGVSC_RE = re.compile(r'^c\.(\d+)([ACGT])>([ACGT])$') + +THREE_TO_ONE = { + 'Ala':'A','Arg':'R','Asn':'N','Asp':'D','Cys':'C','Glu':'E','Gln':'Q', + 'Gly':'G','His':'H','Ile':'I','Leu':'L','Lys':'K','Met':'M','Phe':'F', + 'Pro':'P','Ser':'S','Thr':'T','Trp':'W','Tyr':'Y','Val':'V', +} + +# Tavtigian points per code +POINTS = { + 'PM1_Moderate': 2, 'PM1_Supporting': 1, + 'PS3_Strong': 4, 'PS3': 4, 'PS3_Moderate': 2, 'PS3_Supporting': 1, + 'BS3_Strong': -4, 'BS3': -4, 'BS3_Supporting': -1, + 'PP3_Moderate': 2, 'PP3_moderate': 2, 'PP3': 1, + 'BP4_Moderate': -2, 'BP4_moderate': -2, 'BP4': -1, + 'BA1': 0, # stand-alone B -> not summed; forces class + 'BS1': -4, + 'PM2_Supporting': 1, + 'BS2': -4, + 'No evidence': 0, 'Indeterminate': 0, +} + +SPLICE_PP3_THRESHOLD = 0.2 # CSpec GN009: SpliceAI >= 0.2 -> PP3 splicing + + +def bucket(pts): + if pts >= 10: + return 'Pathogenic' + if 6 <= pts <= 9: + return 'Likely pathogenic' + if -6 <= pts <= -2: + return 'Likely benign' + if pts <= -7: + return 'Benign' + return 'Uncertain significance' + + +CLASS_COLOR = { + 'Pathogenic': '210,0,0', + 'Likely pathogenic': '245,152,152', + 'Uncertain significance': '0,0,136', + 'Likely benign': '213,247,213', + 'Benign': '0,210,0', +} + +AUTOSQL = """table TP53ProvisionalClass +"TP53 NON-FINAL Provisional Classification by Tavtigian point sum (NOT a VCEP classification)" + ( + string chrom; "Reference sequence chromosome or scaffold" + uint chromStart; "Start position in chromosome" + uint chromEnd; "End position in chromosome" + string name; "Missense change (e.g., R175H)" + uint score; "Not used, all 0" + char[1] strand; "Not used, all ." + uint thickStart; "Same as chromStart" + uint thickEnd; "Same as chromEnd" + uint reserved; "RGB color" + string provisionalClass; "Provisional class (Pathogenic/LP/VUS/LB/Benign)" + int totalPoints; "Sum of applied Tavtigian points" + string appliedCodes; "Codes contributing (semicolon-separated)" + string pm1; "PM1 contribution" + string ps3bs3; "PS3/BS3 contribution (from Table S3)" + string pp3bp4; "PP3/BP4 contribution (Table S2 + splicing)" + string af; "AF code (BA1/BS1/PM2_Supporting/none)" + string bs2; "BS2 evidence (FLOSSIES cohort observation)" + string spliceAI; "Max SpliceAI delta (from Table S2)" + lstring _mouseOver; "HTML mouseover" + ) +""" + + +def parse_missense_short(p): + if not isinstance(p, str): + return None + m = PROT_RE.match(p.strip()) + if not m: + return None + return (m.group(1), int(m.group(2)), m.group(3)) + + +def parse_hgvsp3(p): + if not isinstance(p, str): + return None + m = HGVSP3_RE.match(p.strip()) + if not m: + return None + wt3, codon, alt3 = m.group(1), int(m.group(2)), m.group(3) + if wt3 not in THREE_TO_ONE or alt3 not in THREE_TO_ONE: + return None + return (THREE_TO_ONE[wt3], codon, THREE_TO_ONE[alt3]) + + +def load_s3(path): + """Table S3: per-missense preliminary PS3/BS3. Key by (wt, codon, alt) 1-letter.""" + wb = openpyxl.load_workbook(path, data_only=True) + ws = wb["Supplementary Table S3"] + out = {} + for row in ws.iter_rows(min_row=4, values_only=True): + k = parse_missense_short(row[0]) + if not k: + continue + out[k] = { + 'code': str(row[7]).strip() if row[7] is not None else 'No evidence', + } + return out + + +def load_s2(path): + """Table S2: per c.X>Y missense with preliminary PP3/BP4 + max SpliceAI. + + Returns {(wt, codon, alt) -> {code, spliceai_max, paths: [(c_pos, ref_nt, alt_nt), ...]}}. + 'paths' enumerates every c.X>Y combination that yields the protein change — + used downstream to look up AF at each contributing genomic coord. + """ + wb = openpyxl.load_workbook(path, data_only=True) + ws = wb["Supplementary Table S2"] + out = {} + for row in ws.iter_rows(min_row=4, values_only=True): + k = parse_hgvsp3(row[1]) + if not k: + continue + hgvsc = row[0] + path_tuple = None + if isinstance(hgvsc, str): + mc = HGVSC_RE.match(hgvsc.strip()) + if mc: + path_tuple = (int(mc.group(1)), mc.group(2), mc.group(3)) + new_code = str(row[4]).strip() if row[4] is not None else 'No evidence' + try: + new_spliceai = float(row[5]) if row[5] is not None else 0.0 + except (ValueError, TypeError): + new_spliceai = 0.0 + existing = out.get(k) + if existing is None: + out[k] = { + 'code': new_code, + 'spliceai': new_spliceai, + 'paths': [path_tuple] if path_tuple else [], + } + else: + if _s2_priority(new_code) > _s2_priority(existing['code']): + existing['code'] = new_code + if new_spliceai > existing['spliceai']: + existing['spliceai'] = new_spliceai + if path_tuple and path_tuple not in existing['paths']: + existing['paths'].append(path_tuple) + return out + + +def _s2_priority(code): + order = { + 'PP3_moderate': 5, 'BP4_moderate': 5, + 'PP3': 4, 'BP4': 4, + 'No evidence': 1, + } + return order.get(code, 0) + + +def load_hotspot_occurrences(path): + out = {} + if not os.path.exists(path): + return out + with open(path) as f: + data = json.load(f) + for h in data: + if h.get('hugoSymbol') != 'TP53': + continue + res = h.get('residue') or '' + if not res: + continue + wt = res[0] + try: + codon = int(res[1:]) + except ValueError: + continue + for alt, count in (h.get('variantAminoAcid') or {}).items(): + if alt == wt or alt in ('*', 'X') or count is None or count < 2: + continue + out[(codon, alt)] = count + return out + + +def load_af_lookup(db): + """Read TP53AF_.bed and return {(chrom, pos1based, ref, alt) -> code}. + + The display name in the AF bed is 'chrN-pos-ref-alt' (1-based pos). + """ + out = {} + path = AF_BED_TPL.format(db) + if not os.path.exists(path): + return out + with open(path) as f: + for line in f: + flds = line.rstrip("\n").split("\t") + if len(flds) < 11: + continue + disp = flds[3] + code = flds[9] + m = re.match(r'^(chr[^-]+)-(\d+)-([ACGT-]+)-([ACGT-]+)$', disp) + if not m: + continue + out[(m.group(1), int(m.group(2)), m.group(3), m.group(4))] = code + return out + + +def load_flossies_lookup(db): + """Read TP53Flossies_.bed and return set of (chrom, pos1based, ref, alt) + keys for rows with BS2 applicability = 'BS2'. Match is variant-specific + (a FLOSSIES observation supports BS2 only for the exact nt change observed, + not for any change at the same codon).""" + out = set() + path = FLOSSIES_BED_TPL.format(db) + if not os.path.exists(path): + return out + with open(path) as f: + for line in f: + flds = line.rstrip("\n").split("\t") + if len(flds) < 19: + continue + if flds[9] != 'BS2': + continue + chrom = flds[0] + start = int(flds[1]) + ref = flds[17] + alt = flds[18] + out.add((chrom, start + 1, ref, alt)) + return out + + +def pm1_code_and_points(wt, codon, alt, hotspot_occ): + if codon in PM1_HARDCODED_CODONS: + return ('PM1_Moderate', 2) + count = hotspot_occ.get((codon, alt), 0) + if count >= 10: + return ('PM1_Moderate', 2) + if count >= 2: + return ('PM1_Supporting', 1) + return ('', 0) + + +def ps3_bs3_label_and_points(code): + if code in ('PS3', 'PS3_Strong'): + return ('PS3_Strong', 4) + if code == 'PS3_Moderate': + return (code, 2) + if code == 'PS3_Supporting': + return (code, 1) + if code == 'BS3_Supporting': + return (code, -1) + if code in ('BS3', 'BS3_Strong'): + return ('BS3_Strong', -4) + return ('', 0) + + +def pp3_bp4_label_and_points(code): + if code == 'PP3_moderate': + return ('PP3_Moderate', 2) + if code == 'PP3': + return ('PP3', 1) + if code == 'BP4': + return ('BP4', -1) + if code == 'BP4_moderate': + return ('BP4_Moderate', -2) + return ('', 0) + + +def af_code_and_points(wt, codon, alt, paths, af_lookup, tx): + """For a (wt, codon, alt) protein change, look up gnomAD v4.1 AF at every + c.X>Y path and return the strongest applicable code. + + Priority (most-benign first): BA1 > BS1 > PM2_Supporting > none. + BA1 is stand-alone Benign; the caller forces classification = Benign. + """ + chrom = tx['chrom'] + strand = tx['strand'] + rcomp = {'A':'T','T':'A','C':'G','G':'C'} + seen = [] + for c_pos, c_ref, c_alt in paths: + g = lib.cdna_coding_to_genomic(c_pos, tx) + if g is None: + continue + if strand == '-': + g_ref = rcomp.get(c_ref, c_ref) + g_alt = rcomp.get(c_alt, c_alt) + else: + g_ref = c_ref + g_alt = c_alt + # AF lookup uses 1-based start + key = (chrom, g + 1, g_ref, g_alt) + code = af_lookup.get(key) + if code: + seen.append(code) + if 'BA1' in seen: + return ('BA1', 0, 'stand-alone Benign') + if 'BS1' in seen: + return ('BS1', POINTS['BS1'], '-4 pts') + if 'PM2_Supporting' in seen: + return ('PM2_Supporting', POINTS['PM2_Supporting'], '+1 pt') + return ('', 0, '') + + +def bs2_observed(wt, codon, paths, flossies_lookup, tx): + """Return True if any (c.X>Y) path resolves to a FLOSSIES variant that + matches by exact genomic position AND ref/alt. BS2 requires the SAME + nt change — an observation of c.1120G>A does not support BS2 for + c.1120G>C even though both yield p.G374R.""" + chrom = tx['chrom'] + strand = tx['strand'] + rcomp = {'A':'T','T':'A','C':'G','G':'C'} + for c_pos, c_ref, c_alt in paths: + g = lib.cdna_coding_to_genomic(c_pos, tx) + if g is None: + continue + if strand == '-': + g_ref = rcomp.get(c_ref, c_ref) + g_alt = rcomp.get(c_alt, c_alt) + else: + g_ref = c_ref + g_alt = c_alt + if (chrom, g + 1, g_ref, g_alt) in flossies_lookup: + return True + return False + + +CAVEATS_STR = ( + "PVS1 (null variants only — see PVS1 tracks); " + "PS1/PS2/PS4/PP1/PP4/BS4/BP7 (require clinical observations)." +) + + +HEADER_WARNING = ( + "" + "NON-FINAL: NOT a VCEP classification — preliminary point-sum only." + "" +) + + +def mouseover(name, cls, pts, pm1, ps3, pp3, af_lbl, bs2_lbl, applied, + spliceai, splice_pp3_active, splice_overrode_bp4, codon, ba1): + splice_section = "" + if spliceai and spliceai > 0: + if splice_pp3_active and splice_overrode_bp4: + splice_section = ( + "
SpliceAI: {sa:.2f} — " + "" + "splicing PP3 applies (supersedes missense BP4); " + "potential splice disruption." + "" + ).format(sa=spliceai) + elif splice_pp3_active: + splice_section = ( + "
SpliceAI: {sa:.2f} — splicing PP3 applies" + ).format(sa=spliceai) + else: + splice_section = "
SpliceAI: {sa:.2f}".format(sa=spliceai) + codon72 = "" + if codon == 72: + codon72 = ( + "
Note: Codon 72 PS3/BS3 data is measured on the R72 " + "haplotype (rs1042522); reference is P72. P72X variants are NOT " + "in Table S3 (only R72X is). See description page for details." + ) + ba1_section = "" + if ba1: + ba1_section = ( + "
BA1 stand-alone Benign: " + "FAF ≥ 0.001 in gnomAD v4.1 forces class = Benign." + ) + return ( + "{warn}" + "
Variant: p.{name} (NP_000537.3)" + "
Provisional class: {cls} ({pts} pts)" + "
PM1: {pm1}" + "
PS3/BS3: {ps3}" + "
PP3/BP4: {pp3}" + "
AF (gnomAD v4.1): {af}" + "
BS2 (FLOSSIES): {bs2}" + "
Applied codes: {applied}" + "{splice}{ba1_section}{codon72}" + "
NOT included in this sum: {cav}" + ).format(warn=HEADER_WARNING, + name=name, cls=cls, pts=pts, + pm1=pm1 or "No contribution", + ps3=ps3 or "No contribution", + pp3=pp3 or "No contribution", + af=af_lbl or "No contribution", + bs2=bs2_lbl, + applied=applied or "(none)", + splice=splice_section, + ba1_section=ba1_section, + codon72=codon72, + cav=CAVEATS_STR) + + +def generate_bed(s3, s2, hotspot_occ, af_lookup, flossies_lookup, tx): + lines = [] + chrom = tx['chrom'] + for (wt, codon, alt), s3_rec in sorted(s3.items(), key=lambda kv: (kv[0][1], kv[0][2])): + pm1_lbl, pm1_pts = pm1_code_and_points(wt, codon, alt, hotspot_occ) + ps3_lbl, ps3_pts = ps3_bs3_label_and_points(s3_rec['code']) + s2_rec = s2.get((wt, codon, alt)) + pp3_lbl, pp3_pts = ('', 0) + spliceai = 0.0 + paths = [] + if s2_rec: + pp3_lbl, pp3_pts = pp3_bp4_label_and_points(s2_rec['code']) + spliceai = s2_rec.get('spliceai', 0.0) + paths = s2_rec.get('paths', []) + + # Megan's splicing rule: SpliceAI >= 0.2 -> PP3 splicing applies. + # When the missense call is BP4 / BP4_Moderate, splicing PP3 + # supersedes — replace the BP4 contribution with PP3 (+1). + splice_pp3_active = spliceai >= SPLICE_PP3_THRESHOLD + splice_overrode_bp4 = False + if splice_pp3_active: + if pp3_lbl in ('BP4', 'BP4_Moderate'): + pp3_lbl = 'PP3 (splicing)' + pp3_pts = 1 + splice_overrode_bp4 = True + elif not pp3_lbl: + pp3_lbl = 'PP3 (splicing)' + pp3_pts = 1 + + # Allele-frequency code (BA1 / BS1 / PM2_Supporting) + af_code, af_pts, af_qty = af_code_and_points( + wt, codon, alt, paths, af_lookup, tx) + af_lbl = "{} ({})".format(af_code, af_qty) if af_code else '' + ba1 = (af_code == 'BA1') + + # BS2 from FLOSSIES + bs2_applies = bs2_observed(wt, codon, paths, flossies_lookup, tx) + bs2_pts = POINTS['BS2'] if bs2_applies else 0 + bs2_lbl = "BS2 (-4 pts)" if bs2_applies else "Not observed" + + total = pm1_pts + ps3_pts + pp3_pts + af_pts + bs2_pts + + if ba1: + cls = 'Benign' + else: + cls = bucket(total) + + color = CLASS_COLOR[cls] + + applied_codes = [] + if pm1_lbl: applied_codes.append("{} (+{})".format(pm1_lbl, pm1_pts)) + if ps3_lbl: + sign = "+" if ps3_pts > 0 else "" + applied_codes.append("{} ({}{})".format(ps3_lbl, sign, ps3_pts)) + if pp3_lbl: + sign = "+" if pp3_pts > 0 else "" + applied_codes.append("{} ({}{})".format(pp3_lbl, sign, pp3_pts)) + if af_code == 'BA1': + applied_codes.append("BA1 (stand-alone B)") + elif af_code: + sign = "+" if af_pts > 0 else "" + applied_codes.append("{} ({}{})".format(af_code, sign, af_pts)) + if bs2_applies: + applied_codes.append("BS2 (-4)") + applied = "; ".join(applied_codes) + + short = "{}{}{}".format(wt, codon, alt) + mo = mouseover(short, cls, total, + pm1_lbl, ps3_lbl, pp3_lbl, af_lbl, bs2_lbl, applied, + spliceai, splice_pp3_active, splice_overrode_bp4, + codon, ba1) + + segs = lib.aa_codon_genomic(codon, tx) + for g_start, g_end, _ex in segs: + lines.append("\t".join([ + chrom, str(g_start), str(g_end), + short, "0", ".", + str(g_start), str(g_end), + color, + cls, str(total), + applied, + pm1_lbl or "-", + ps3_lbl or "-", + pp3_lbl or "-", + af_code or "-", + "BS2" if bs2_applies else "-", + "{:.2f}".format(spliceai) if spliceai else "0.00", + mo, + ])) + return lines + + +def build(db, outdir): + print("=== {} ===".format(db)) + os.makedirs(outdir, exist_ok=True) + s3 = load_s3(SRC_S3) + s2 = load_s2(SRC_S2) + hotspots = load_hotspot_occurrences(HOTSPOTS_JSON) + af_lookup = load_af_lookup(db) + flossies_lookup = load_flossies_lookup(db) + print(" S3 entries: {} S2 entries: {} " + "cancerhotspots: {} AF: {} FLOSSIES BS2: {}".format( + len(s3), len(s2), len(hotspots), + len(af_lookup), len(flossies_lookup))) + + tx = lib.get_transcript_info(db) + bed_lines = generate_bed(s3, s2, hotspots, af_lookup, flossies_lookup, tx) + print(" {} BED rows".format(len(bed_lines))) + + as_file = os.path.join(outdir, "TP53ProvisionalClass.as") + lib.write_autosql(as_file, AUTOSQL) + bed = os.path.join(outdir, "TP53ProvisionalClass_{}.bed".format(db)) + with open(bed, 'w') as f: + f.write("\n".join(bed_lines) + "\n") + lib.bash("sort -k1,1 -k2,2n {0} -o {0}".format(bed)) + bb = os.path.join(outdir, "TP53ProvisionalClass{}.bb".format(db.capitalize())) + lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+9") + print(" wrote {}".format(bb)) + + from collections import Counter + cnt = Counter() + af_cnt = Counter() + bs2_cnt = 0 + with open(bed) as f: + for line in f: + flds = line.split("\t") + cnt[flds[9]] += 1 + af_cnt[flds[13]] += 1 + if flds[14] == 'BS2': + bs2_cnt += 1 + print(" Class distribution:") + for k, n in cnt.most_common(): + print(" {}: {}".format(k, n)) + print(" AF code distribution: {}".format(dict(af_cnt))) + print(" BS2 applied: {} rows".format(bs2_cnt)) + + +def main(): + p = argparse.ArgumentParser(description=__doc__) + p.add_argument('-o', '--output-dir', default=DEFAULT_OUTDIR) + p.add_argument('--db', action='append', help='hg38 or hg19 (repeat). Default hg38.') + args = p.parse_args() + dbs = args.db if args.db else ['hg38'] + for db in dbs: + build(db, args.output_dir) + + +if __name__ == "__main__": + main()