f24ea956ba3b7a8c7ee8be0826d66489b85e0bbc
lrnassar
Tue Apr 28 18:22:48 2026 -0700
Adding TP53 VCEP track hub build scripts. refs #37399
16 Python scripts that build the 15 bigBed tracks for the ClinGen TP53
VCEP track hub (CSpec GN009 v2.4.0, NM_000546.6 / NP_000537.3). Includes
the NON-FINAL Provisional Classification (Tavtigian point sum across PM1
+ PS3/BS3 + PP3/BP4 + AF + BS2 + splicing PP3), gnomAD v4.1 AF codes,
FLOSSIES BS2 evidence, Bioinformatic PP3/BP4 (missense + single-aa
in-frame del), VCEP curated variants from EvRepo with ClinVar VCV
backfill, PM1 (clinical domains + cancerhotspots), PVS1 regions and
splice sites, and the Functional Evidence composite (VCEP preliminary
PS3/BS3 with per-paper raw scores from Kato/Giacomelli/Kawaguchi/Funk).
diff --git src/hg/makeDb/scripts/tp53/tp53CancerHotspots.py src/hg/makeDb/scripts/tp53/tp53CancerHotspots.py
new file mode 100644
index 00000000000..09901e7a9f4
--- /dev/null
+++ src/hg/makeDb/scripts/tp53/tp53CancerHotspots.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+"""
+TP53 VCEP cancerhotspots.org subtrack generator (PM1 Evidence composite).
+
+Fetches cancerhotspots.org single-residue hotspots, filters to TP53, and
+emits a per-AA-change bigBed 9+4 with ACMG PM1 strength assignment:
+
+ PM1_Moderate (+2 pts) >=10 somatic occurrences of this exact aa change
+ PM1_Supporting (+1 pt) 2-9 occurrences
+
+Synonymous (WT->WT) and stop-gain (*) variants are excluded; only missense.
+Live pull with a cached-snapshot fallback.
+"""
+
+import argparse
+import json
+import os
+import sys
+import urllib.request
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import tp53FuncLib as lib
+
+DEFAULT_OUTDIR = "/hive/users/lrnassar/claude/RM37399/cancerHotspots"
+SNAPSHOT_NAME = "cancerhotspots_single.json"
+API_URL = "https://www.cancerhotspots.org/api/hotspots/single"
+
+COLOR_MOD = "230,3,131" # fuchsia — PM1_Moderate
+COLOR_SUP = "245,182,207" # light fuchsia — PM1_Supporting
+
+AUTOSQL = """table TP53CancerHotspots
+"TP53 somatic hotspots from cancerhotspots.org mapped to ACMG PM1 strength"
+ (
+ string chrom; "Reference sequence chromosome or scaffold"
+ uint chromStart; "Start position in chromosome"
+ uint chromEnd; "End position in chromosome"
+ string name; "Amino acid change (e.g. R175H)"
+ uint score; "Not used, all 0"
+ char[1] strand; "Not used, all ."
+ uint thickStart; "Same as chromStart"
+ uint thickEnd; "Same as chromEnd"
+ uint reserved; "RGB color"
+ string acmgCode; "PM1_Moderate or PM1_Supporting"
+ string points; "Tavtigian points contribution"
+ uint somaticCount; "Somatic occurrence count (cancerhotspots.org)"
+ lstring _mouseOver; "HTML mouseover"
+ )
+"""
+
+
+def fetch_or_load(outdir):
+ path = os.path.join(outdir, SNAPSHOT_NAME)
+ try:
+ req = urllib.request.Request(API_URL, headers={'User-Agent': 'UCSC-kent/TP53-hub'})
+ with urllib.request.urlopen(req, timeout=30) as resp:
+ body = resp.read()
+ data = json.loads(body)
+ # Write snapshot on success
+ with open(path, 'wb') as f:
+ f.write(body)
+ print(" fetched {} records from cancerhotspots.org".format(len(data)))
+ return data
+ except Exception as e:
+ if os.path.exists(path):
+ print(" WARNING: live fetch failed ({}); falling back to snapshot {}".format(e, path))
+ with open(path) as f:
+ return json.load(f)
+ raise RuntimeError("cancerhotspots fetch failed and no snapshot: {}".format(e))
+
+
+def records_for_tp53(data):
+ out = []
+ for h in data:
+ if h.get('hugoSymbol') != 'TP53':
+ continue
+ res = h.get('residue', '')
+ if not res:
+ continue
+ # residue is like "R175"; first char = WT aa, rest = codon position.
+ # cancerhotspots.org also reports splice-site somatic hotspots where
+ # residue starts with 'X' and variants contain 'sp'. Those are NOT
+ # missense and must not be classified PM1 (CSpec §PM1 is missense only).
+ wt = res[0]
+ if wt == 'X' or not wt.isalpha():
+ continue
+ try:
+ codon = int(res[1:])
+ except ValueError:
+ continue
+ variants = h.get('variantAminoAcid') or {}
+ for alt, count in variants.items():
+ if alt == wt: # synonymous — not missense
+ continue
+ if alt in ('*', 'X'): # stop-gain — handled by PVS1, not PM1
+ continue
+ if not alt.isalpha() or len(alt) != 1:
+ # splice-site codes ("sp", "fs", etc.) are not missense
+ continue
+ if count is None or count < 2:
+ continue
+ if count >= 10:
+ acmg, points, color = "PM1_Moderate", "+2", COLOR_MOD
+ else:
+ acmg, points, color = "PM1_Supporting", "+1", COLOR_SUP
+ out.append({
+ 'wt': wt,
+ 'codon': codon,
+ 'alt': alt,
+ 'count': count,
+ 'acmg': acmg,
+ 'points': points,
+ 'color': color,
+ 'name': "{}{}{}".format(wt, codon, alt),
+ })
+ return out
+
+
+def mouseover(rec):
+ return (
+ "{acmg} ({points} pts)"
+ "
Variant: {name} (TP53 NP_000537.3)"
+ "
Somatic occurrences (cancerhotspots.org): {count}"
+ "
Codon: {codon}"
+ "
Source: cancerhotspots.org — PM1 per CSpec GN009 v2.4.0"
+ ).format(**rec)
+
+
+def generate_bed(records, tx):
+ lines = []
+ chrom = tx['chrom']
+ for rec in records:
+ segs = lib.aa_codon_genomic(rec['codon'], tx)
+ if not segs:
+ continue
+ for g_start, g_end, _ex in segs:
+ lines.append("\t".join([
+ chrom, str(g_start), str(g_end),
+ rec['name'], "0", ".",
+ str(g_start), str(g_end),
+ rec['color'],
+ rec['acmg'], rec['points'], str(rec['count']),
+ mouseover(rec),
+ ]))
+ return lines
+
+
+def build(db, outdir):
+ print("=== {} ===".format(db))
+ os.makedirs(outdir, exist_ok=True)
+ data = fetch_or_load(outdir)
+ records = records_for_tp53(data)
+ print(" {} TP53 PM1 records".format(len(records)))
+ tx = lib.get_transcript_info(db)
+ bed_lines = generate_bed(records, tx)
+ print(" {} BED rows".format(len(bed_lines)))
+
+ as_file = os.path.join(outdir, "TP53CancerHotspots.as")
+ lib.write_autosql(as_file, AUTOSQL)
+ bed = os.path.join(outdir, "TP53CancerHotspots_{}.bed".format(db))
+ with open(bed, 'w') as f:
+ f.write("\n".join(bed_lines) + "\n")
+ lib.bash("sort -k1,1 -k2,2n {0} -o {0}".format(bed))
+ bb = os.path.join(outdir, "TP53CancerHotspots{}.bb".format(db.capitalize()))
+ lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+4")
+ print(" wrote {}".format(bb))
+
+
+def main():
+ p = argparse.ArgumentParser(description=__doc__)
+ p.add_argument('-o', '--output-dir', default=DEFAULT_OUTDIR)
+ p.add_argument('--db', action='append', help='hg38 or hg19 (repeat). Default hg38.')
+ args = p.parse_args()
+ dbs = args.db if args.db else ['hg38']
+ for db in dbs:
+ build(db, args.output_dir)
+
+
+if __name__ == "__main__":
+ main()