a56d88e05670b759ff3b32829542537ccc790c57 lrnassar Tue Apr 28 19:18:20 2026 -0700 Address CR feedback on insight + tp53 hub scripts. refs #37418 Drop duplicated bash() wrappers in favor of subprocess.run / check_output with list args, eliminating shell=True, embedded-quote concerns, and stderr-into-stdout merging. Centralize common operations as run_sort_bed/run_liftOver in tp53FuncLib alongside existing run_bedToBigBed. Switch HTML escaping to stdlib html.escape() consistently. insightHCIPriors mouseover (previously unescaped) now escapes HGVS fields, addressing the specific c.123A>G case Jonathan flagged. Replace invalid
tags with
across all five affected mouseover sites. diff --git src/hg/makeDb/scripts/tp53/tp53ProvisionalClass.py src/hg/makeDb/scripts/tp53/tp53ProvisionalClass.py index a304e51f4bf..0986e8bee81 100644 --- src/hg/makeDb/scripts/tp53/tp53ProvisionalClass.py +++ src/hg/makeDb/scripts/tp53/tp53ProvisionalClass.py @@ -1,593 +1,593 @@ #!/usr/bin/env python3 """ TP53 VCEP NON-FINAL Provisional Classification track generator. For every possible p53 missense protein change, sum the Tavtigian points from applicable evidence codes the VCEP has pre-computed: * PM1 hotspot (Clinical Domains + cancerhotspots.org) * PS3 / BS3 (CSpec GN009 v2.4.0 Table S3) * PP3 / BP4 (CSpec GN009 v2.4.0 Table S2) * BA1 / BS1 / PM2 (gnomAD v4.1 exomes, per CSpec FAF/grpmax thresholds) * BS2 (FLOSSIES healthy-women-over-70 cohort) * Splicing PP3 (SpliceAI >= 0.2) The point sum is bucketed into P / LP / VUS / LB / B per the CSpec classification ranges. BA1 is stand-alone Benign and forces class = Benign regardless of other evidence. DELIBERATELY EXCLUDED from the sum (documented in every mouseover): - PVS1 (null variants only; handled in separate track) - PS1 / PS2 / PS4 / PP1 / PP4 / BS4 / BP7 (require clinical observations) This is NOT a VCEP classification — the warning is in every mouseover since clinicians live in the mouseover, not the description page. bigBed 9+9. """ import argparse import json import os import re import sys import openpyxl sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) import tp53FuncLib as lib DEFAULT_OUTDIR = "/hive/users/lrnassar/claude/RM37399/provisionalClass" SRC_S3 = "/hive/users/lrnassar/claude/RM37399/tp53_downloads/Functional-worksheet.xlsx" SRC_S2 = "/hive/users/lrnassar/claude/RM37399/tp53_downloads/bioinformatic_worksheet.xlsx" HOTSPOTS_JSON = "/hive/users/lrnassar/claude/RM37399/cancerHotspots/cancerhotspots_single.json" AF_BED_TPL = "/hive/users/lrnassar/claude/RM37399/afFrequencies/TP53AF_{}.bed" FLOSSIES_BED_TPL = "/hive/users/lrnassar/claude/RM37399/flossies/TP53Flossies_{}.bed" PM1_HARDCODED_CODONS = {175, 245, 248, 249, 273, 282} PROT_RE = re.compile(r'^([A-Z])(\d+)([A-Z])$') HGVSP3_RE = re.compile(r'^p\.([A-Z][a-z]{2})(\d+)([A-Z][a-z]{2})$') HGVSC_RE = re.compile(r'^c\.(\d+)([ACGT])>([ACGT])$') THREE_TO_ONE = { 'Ala':'A','Arg':'R','Asn':'N','Asp':'D','Cys':'C','Glu':'E','Gln':'Q', 'Gly':'G','His':'H','Ile':'I','Leu':'L','Lys':'K','Met':'M','Phe':'F', 'Pro':'P','Ser':'S','Thr':'T','Trp':'W','Tyr':'Y','Val':'V', } # Tavtigian points per code POINTS = { 'PM1_Moderate': 2, 'PM1_Supporting': 1, 'PS3_Strong': 4, 'PS3': 4, 'PS3_Moderate': 2, 'PS3_Supporting': 1, 'BS3_Strong': -4, 'BS3': -4, 'BS3_Supporting': -1, 'PP3_Moderate': 2, 'PP3_moderate': 2, 'PP3': 1, 'BP4_Moderate': -2, 'BP4_moderate': -2, 'BP4': -1, 'BA1': 0, # stand-alone B -> not summed; forces class 'BS1': -4, 'PM2_Supporting': 1, 'BS2': -4, 'No evidence': 0, 'Indeterminate': 0, } SPLICE_PP3_THRESHOLD = 0.2 # CSpec GN009: SpliceAI >= 0.2 -> PP3 splicing def bucket(pts): if pts >= 10: return 'Pathogenic' if 6 <= pts <= 9: return 'Likely pathogenic' if -6 <= pts <= -2: return 'Likely benign' if pts <= -7: return 'Benign' return 'Uncertain significance' CLASS_COLOR = { 'Pathogenic': '210,0,0', 'Likely pathogenic': '245,152,152', 'Uncertain significance': '0,0,136', 'Likely benign': '213,247,213', 'Benign': '0,210,0', } AUTOSQL = """table TP53ProvisionalClass "TP53 NON-FINAL Provisional Classification by Tavtigian point sum (NOT a VCEP classification)" ( string chrom; "Reference sequence chromosome or scaffold" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Missense change (e.g., R175H)" uint score; "Not used, all 0" char[1] strand; "Not used, all ." uint thickStart; "Same as chromStart" uint thickEnd; "Same as chromEnd" uint reserved; "RGB color" string provisionalClass; "Provisional class (Pathogenic/LP/VUS/LB/Benign)" int totalPoints; "Sum of applied Tavtigian points" string appliedCodes; "Codes contributing (semicolon-separated)" string pm1; "PM1 contribution" string ps3bs3; "PS3/BS3 contribution (from Table S3)" string pp3bp4; "PP3/BP4 contribution (Table S2 + splicing)" string af; "AF code (BA1/BS1/PM2_Supporting/none)" string bs2; "BS2 evidence (FLOSSIES cohort observation)" string spliceAI; "Max SpliceAI delta (from Table S2)" lstring _mouseOver; "HTML mouseover" ) """ def parse_missense_short(p): if not isinstance(p, str): return None m = PROT_RE.match(p.strip()) if not m: return None return (m.group(1), int(m.group(2)), m.group(3)) def parse_hgvsp3(p): if not isinstance(p, str): return None m = HGVSP3_RE.match(p.strip()) if not m: return None wt3, codon, alt3 = m.group(1), int(m.group(2)), m.group(3) if wt3 not in THREE_TO_ONE or alt3 not in THREE_TO_ONE: return None return (THREE_TO_ONE[wt3], codon, THREE_TO_ONE[alt3]) def load_s3(path): """Table S3: per-missense preliminary PS3/BS3. Key by (wt, codon, alt) 1-letter.""" wb = openpyxl.load_workbook(path, data_only=True) ws = wb["Supplementary Table S3"] out = {} for row in ws.iter_rows(min_row=4, values_only=True): k = parse_missense_short(row[0]) if not k: continue out[k] = { 'code': str(row[7]).strip() if row[7] is not None else 'No evidence', } return out def load_s2(path): """Table S2: per c.X>Y missense with preliminary PP3/BP4 + max SpliceAI. Returns {(wt, codon, alt) -> {code, spliceai_max, paths: [(c_pos, ref_nt, alt_nt), ...]}}. 'paths' enumerates every c.X>Y combination that yields the protein change — used downstream to look up AF at each contributing genomic coord. """ wb = openpyxl.load_workbook(path, data_only=True) ws = wb["Supplementary Table S2"] out = {} for row in ws.iter_rows(min_row=4, values_only=True): k = parse_hgvsp3(row[1]) if not k: continue hgvsc = row[0] path_tuple = None if isinstance(hgvsc, str): mc = HGVSC_RE.match(hgvsc.strip()) if mc: path_tuple = (int(mc.group(1)), mc.group(2), mc.group(3)) new_code = str(row[4]).strip() if row[4] is not None else 'No evidence' try: new_spliceai = float(row[5]) if row[5] is not None else 0.0 except (ValueError, TypeError): new_spliceai = 0.0 existing = out.get(k) if existing is None: out[k] = { 'code': new_code, 'spliceai': new_spliceai, 'paths': [path_tuple] if path_tuple else [], } else: if _s2_priority(new_code) > _s2_priority(existing['code']): existing['code'] = new_code if new_spliceai > existing['spliceai']: existing['spliceai'] = new_spliceai if path_tuple and path_tuple not in existing['paths']: existing['paths'].append(path_tuple) return out def _s2_priority(code): order = { 'PP3_moderate': 5, 'BP4_moderate': 5, 'PP3': 4, 'BP4': 4, 'No evidence': 1, } return order.get(code, 0) def load_hotspot_occurrences(path): out = {} if not os.path.exists(path): return out with open(path) as f: data = json.load(f) for h in data: if h.get('hugoSymbol') != 'TP53': continue res = h.get('residue') or '' if not res: continue wt = res[0] try: codon = int(res[1:]) except ValueError: continue for alt, count in (h.get('variantAminoAcid') or {}).items(): if alt == wt or alt in ('*', 'X') or count is None or count < 2: continue out[(codon, alt)] = count return out def load_af_lookup(db): """Read TP53AF_.bed and return {(chrom, pos1based, ref, alt) -> code}. The display name in the AF bed is 'chrN-pos-ref-alt' (1-based pos). """ out = {} path = AF_BED_TPL.format(db) if not os.path.exists(path): return out with open(path) as f: for line in f: flds = line.rstrip("\n").split("\t") if len(flds) < 11: continue disp = flds[3] code = flds[9] m = re.match(r'^(chr[^-]+)-(\d+)-([ACGT-]+)-([ACGT-]+)$', disp) if not m: continue out[(m.group(1), int(m.group(2)), m.group(3), m.group(4))] = code return out def load_flossies_lookup(db): """Read TP53Flossies_.bed and return set of (chrom, pos1based, ref, alt) keys for rows with BS2 applicability = 'BS2'. Match is variant-specific (a FLOSSIES observation supports BS2 only for the exact nt change observed, not for any change at the same codon).""" out = set() path = FLOSSIES_BED_TPL.format(db) if not os.path.exists(path): return out with open(path) as f: for line in f: flds = line.rstrip("\n").split("\t") if len(flds) < 19: continue if flds[9] != 'BS2': continue chrom = flds[0] start = int(flds[1]) ref = flds[17] alt = flds[18] out.add((chrom, start + 1, ref, alt)) return out def pm1_code_and_points(wt, codon, alt, hotspot_occ): if codon in PM1_HARDCODED_CODONS: return ('PM1_Moderate', 2) count = hotspot_occ.get((codon, alt), 0) if count >= 10: return ('PM1_Moderate', 2) if count >= 2: return ('PM1_Supporting', 1) return ('', 0) def ps3_bs3_label_and_points(code): if code in ('PS3', 'PS3_Strong'): return ('PS3_Strong', 4) if code == 'PS3_Moderate': return (code, 2) if code == 'PS3_Supporting': return (code, 1) if code == 'BS3_Supporting': return (code, -1) if code in ('BS3', 'BS3_Strong'): return ('BS3_Strong', -4) return ('', 0) def pp3_bp4_label_and_points(code): if code == 'PP3_moderate': return ('PP3_Moderate', 2) if code == 'PP3': return ('PP3', 1) if code == 'BP4': return ('BP4', -1) if code == 'BP4_moderate': return ('BP4_Moderate', -2) return ('', 0) def af_code_and_points(wt, codon, alt, paths, af_lookup, tx): """For a (wt, codon, alt) protein change, look up gnomAD v4.1 AF at every c.X>Y path and return the strongest applicable code. Priority (most-benign first): BA1 > BS1 > PM2_Supporting > none. BA1 is stand-alone Benign; the caller forces classification = Benign. """ chrom = tx['chrom'] strand = tx['strand'] rcomp = {'A':'T','T':'A','C':'G','G':'C'} seen = [] for c_pos, c_ref, c_alt in paths: g = lib.cdna_coding_to_genomic(c_pos, tx) if g is None: continue if strand == '-': g_ref = rcomp.get(c_ref, c_ref) g_alt = rcomp.get(c_alt, c_alt) else: g_ref = c_ref g_alt = c_alt # AF lookup uses 1-based start key = (chrom, g + 1, g_ref, g_alt) code = af_lookup.get(key) if code: seen.append(code) if 'BA1' in seen: return ('BA1', 0, 'stand-alone Benign') if 'BS1' in seen: return ('BS1', POINTS['BS1'], '-4 pts') if 'PM2_Supporting' in seen: return ('PM2_Supporting', POINTS['PM2_Supporting'], '+1 pt') return ('', 0, '') def bs2_observed(wt, codon, paths, flossies_lookup, tx): """Return True if any (c.X>Y) path resolves to a FLOSSIES variant that matches by exact genomic position AND ref/alt. BS2 requires the SAME nt change — an observation of c.1120G>A does not support BS2 for c.1120G>C even though both yield p.G374R.""" chrom = tx['chrom'] strand = tx['strand'] rcomp = {'A':'T','T':'A','C':'G','G':'C'} for c_pos, c_ref, c_alt in paths: g = lib.cdna_coding_to_genomic(c_pos, tx) if g is None: continue if strand == '-': g_ref = rcomp.get(c_ref, c_ref) g_alt = rcomp.get(c_alt, c_alt) else: g_ref = c_ref g_alt = c_alt if (chrom, g + 1, g_ref, g_alt) in flossies_lookup: return True return False CAVEATS_STR = ( "PVS1 (null variants only — see PVS1 tracks); " "PS1/PS2/PS4/PP1/PP4/BS4/BP7 (require clinical observations)." ) HEADER_WARNING = ( "" "NON-FINAL: NOT a VCEP classification — preliminary point-sum only." "" ) def mouseover(name, cls, pts, pm1, ps3, pp3, af_lbl, bs2_lbl, applied, spliceai, splice_pp3_active, splice_overrode_bp4, codon, ba1): splice_section = "" if spliceai and spliceai > 0: if splice_pp3_active and splice_overrode_bp4: splice_section = ( "
SpliceAI: {sa:.2f} — " "" "splicing PP3 applies (supersedes missense BP4); " "potential splice disruption." "" ).format(sa=spliceai) elif splice_pp3_active: splice_section = ( "
SpliceAI: {sa:.2f} — splicing PP3 applies" ).format(sa=spliceai) else: splice_section = "
SpliceAI: {sa:.2f}".format(sa=spliceai) codon72 = "" if codon == 72: codon72 = ( "
Note: Codon 72 PS3/BS3 data is measured on the R72 " "haplotype (rs1042522); reference is P72. P72X variants are NOT " "in Table S3 (only R72X is). See description page for details." ) ba1_section = "" if ba1: ba1_section = ( "
BA1 stand-alone Benign: " "FAF ≥ 0.001 in gnomAD v4.1 forces class = Benign." ) return ( "{warn}" "
Variant: p.{name} (NP_000537.3)" "
Provisional class: {cls} ({pts} pts)" "
PM1: {pm1}" "
PS3/BS3: {ps3}" "
PP3/BP4: {pp3}" "
AF (gnomAD v4.1): {af}" "
BS2 (FLOSSIES): {bs2}" "
Applied codes: {applied}" "{splice}{ba1_section}{codon72}" "
NOT included in this sum: {cav}" ).format(warn=HEADER_WARNING, name=name, cls=cls, pts=pts, pm1=pm1 or "No contribution", ps3=ps3 or "No contribution", pp3=pp3 or "No contribution", af=af_lbl or "No contribution", bs2=bs2_lbl, applied=applied or "(none)", splice=splice_section, ba1_section=ba1_section, codon72=codon72, cav=CAVEATS_STR) def generate_bed(s3, s2, hotspot_occ, af_lookup, flossies_lookup, tx): lines = [] chrom = tx['chrom'] for (wt, codon, alt), s3_rec in sorted(s3.items(), key=lambda kv: (kv[0][1], kv[0][2])): pm1_lbl, pm1_pts = pm1_code_and_points(wt, codon, alt, hotspot_occ) ps3_lbl, ps3_pts = ps3_bs3_label_and_points(s3_rec['code']) s2_rec = s2.get((wt, codon, alt)) pp3_lbl, pp3_pts = ('', 0) spliceai = 0.0 paths = [] if s2_rec: pp3_lbl, pp3_pts = pp3_bp4_label_and_points(s2_rec['code']) spliceai = s2_rec.get('spliceai', 0.0) paths = s2_rec.get('paths', []) # Megan's splicing rule: SpliceAI >= 0.2 -> PP3 splicing applies. # When the missense call is BP4 / BP4_Moderate, splicing PP3 # supersedes — replace the BP4 contribution with PP3 (+1). splice_pp3_active = spliceai >= SPLICE_PP3_THRESHOLD splice_overrode_bp4 = False if splice_pp3_active: if pp3_lbl in ('BP4', 'BP4_Moderate'): pp3_lbl = 'PP3 (splicing)' pp3_pts = 1 splice_overrode_bp4 = True elif not pp3_lbl: pp3_lbl = 'PP3 (splicing)' pp3_pts = 1 # Allele-frequency code (BA1 / BS1 / PM2_Supporting) af_code, af_pts, af_qty = af_code_and_points( wt, codon, alt, paths, af_lookup, tx) af_lbl = "{} ({})".format(af_code, af_qty) if af_code else '' ba1 = (af_code == 'BA1') # BS2 from FLOSSIES bs2_applies = bs2_observed(wt, codon, paths, flossies_lookup, tx) bs2_pts = POINTS['BS2'] if bs2_applies else 0 bs2_lbl = "BS2 (-4 pts)" if bs2_applies else "Not observed" total = pm1_pts + ps3_pts + pp3_pts + af_pts + bs2_pts if ba1: cls = 'Benign' else: cls = bucket(total) color = CLASS_COLOR[cls] applied_codes = [] if pm1_lbl: applied_codes.append("{} (+{})".format(pm1_lbl, pm1_pts)) if ps3_lbl: sign = "+" if ps3_pts > 0 else "" applied_codes.append("{} ({}{})".format(ps3_lbl, sign, ps3_pts)) if pp3_lbl: sign = "+" if pp3_pts > 0 else "" applied_codes.append("{} ({}{})".format(pp3_lbl, sign, pp3_pts)) if af_code == 'BA1': applied_codes.append("BA1 (stand-alone B)") elif af_code: sign = "+" if af_pts > 0 else "" applied_codes.append("{} ({}{})".format(af_code, sign, af_pts)) if bs2_applies: applied_codes.append("BS2 (-4)") applied = "; ".join(applied_codes) short = "{}{}{}".format(wt, codon, alt) mo = mouseover(short, cls, total, pm1_lbl, ps3_lbl, pp3_lbl, af_lbl, bs2_lbl, applied, spliceai, splice_pp3_active, splice_overrode_bp4, codon, ba1) segs = lib.aa_codon_genomic(codon, tx) for g_start, g_end, _ex in segs: lines.append("\t".join([ chrom, str(g_start), str(g_end), short, "0", ".", str(g_start), str(g_end), color, cls, str(total), applied, pm1_lbl or "-", ps3_lbl or "-", pp3_lbl or "-", af_code or "-", "BS2" if bs2_applies else "-", "{:.2f}".format(spliceai) if spliceai else "0.00", mo, ])) return lines def build(db, outdir): print("=== {} ===".format(db)) os.makedirs(outdir, exist_ok=True) s3 = load_s3(SRC_S3) s2 = load_s2(SRC_S2) hotspots = load_hotspot_occurrences(HOTSPOTS_JSON) af_lookup = load_af_lookup(db) flossies_lookup = load_flossies_lookup(db) print(" S3 entries: {} S2 entries: {} " "cancerhotspots: {} AF: {} FLOSSIES BS2: {}".format( len(s3), len(s2), len(hotspots), len(af_lookup), len(flossies_lookup))) tx = lib.get_transcript_info(db) bed_lines = generate_bed(s3, s2, hotspots, af_lookup, flossies_lookup, tx) print(" {} BED rows".format(len(bed_lines))) as_file = os.path.join(outdir, "TP53ProvisionalClass.as") lib.write_autosql(as_file, AUTOSQL) bed = os.path.join(outdir, "TP53ProvisionalClass_{}.bed".format(db)) with open(bed, 'w') as f: f.write("\n".join(bed_lines) + "\n") - lib.bash("sort -k1,1 -k2,2n {0} -o {0}".format(bed)) + lib.run_sort_bed(bed) bb = os.path.join(outdir, "TP53ProvisionalClass{}.bb".format(db.capitalize())) lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+9") print(" wrote {}".format(bb)) from collections import Counter cnt = Counter() af_cnt = Counter() bs2_cnt = 0 with open(bed) as f: for line in f: flds = line.split("\t") cnt[flds[9]] += 1 af_cnt[flds[13]] += 1 if flds[14] == 'BS2': bs2_cnt += 1 print(" Class distribution:") for k, n in cnt.most_common(): print(" {}: {}".format(k, n)) print(" AF code distribution: {}".format(dict(af_cnt))) print(" BS2 applied: {} rows".format(bs2_cnt)) def main(): p = argparse.ArgumentParser(description=__doc__) p.add_argument('-o', '--output-dir', default=DEFAULT_OUTDIR) p.add_argument('--db', action='append', help='hg38 or hg19 (repeat). Default hg38.') args = p.parse_args() dbs = args.db if args.db else ['hg38'] for db in dbs: build(db, args.output_dir) if __name__ == "__main__": main()