f24ea956ba3b7a8c7ee8be0826d66489b85e0bbc
lrnassar
Tue Apr 28 18:22:48 2026 -0700
Adding TP53 VCEP track hub build scripts. refs #37399
16 Python scripts that build the 15 bigBed tracks for the ClinGen TP53
VCEP track hub (CSpec GN009 v2.4.0, NM_000546.6 / NP_000537.3). Includes
the NON-FINAL Provisional Classification (Tavtigian point sum across PM1
+ PS3/BS3 + PP3/BP4 + AF + BS2 + splicing PP3), gnomAD v4.1 AF codes,
FLOSSIES BS2 evidence, Bioinformatic PP3/BP4 (missense + single-aa
in-frame del), VCEP curated variants from EvRepo with ClinVar VCV
backfill, PM1 (clinical domains + cancerhotspots), PVS1 regions and
splice sites, and the Functional Evidence composite (VCEP preliminary
PS3/BS3 with per-paper raw scores from Kato/Giacomelli/Kawaguchi/Funk).
diff --git src/hg/makeDb/scripts/tp53/tp53AFfrequencies.py src/hg/makeDb/scripts/tp53/tp53AFfrequencies.py
new file mode 100644
index 00000000000..3e85859e7b4
--- /dev/null
+++ src/hg/makeDb/scripts/tp53/tp53AFfrequencies.py
@@ -0,0 +1,279 @@
+#!/usr/bin/env python3
+"""
+TP53 VCEP Allele Frequencies (BA1/BS1/PM2) track generator.
+
+Pulls gnomAD v4.1 exome variants at the TP53 locus and classifies them
+per CSpec GN009 v2.4.0 thresholds:
+
+ BA1 FAF >= 0.001 stand-alone B
+ BS1 0.0003 <= FAF < 0.001 -4 pts
+ PM2_Supporting AF < 0.00003 global AND grpmax AF < 0.00004 +1 pt
+
+Uses faf95 (col 16) from the UCSC gnomAD v4.1 bigBed, plus grpmax AF
+(col 27) and global AF (col 15). CHIP note (col 29) surfaced in mouseover.
+
+Founder-effect ancestry groups (AJ/FIN/AMI/MID/Remaining) are EXCLUDED from
+the per-ancestry check per CSpec — our PM2_Supporting uses grpmax as a
+conservative proxy and flags when the proxy may miss qualifying variants.
+"""
+
+import argparse
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import tp53FuncLib as lib
+
+DEFAULT_OUTDIR = "/hive/users/lrnassar/claude/RM37399/afFrequencies"
+GNOMAD_BB = "/gbdb/hg38/gnomAD/v4.1/exomes/exomes.bb"
+
+# CSpec GN009 v2.4.0 TP53 thresholds
+BA1_FAF = 0.001
+BS1_FAF_LOW = 0.0003
+BS1_FAF_HIGH = 0.001
+PM2_AF_GLOBAL_MAX = 0.00003
+PM2_AF_GRPMAX_MAX = 0.00004
+
+COLORS = {
+ 'BA1': '2,82,66', # dark teal (stand-alone B)
+ 'BS1': '35,159,134', # teal
+ 'PM2_Supporting': '138,111,158', # purple
+}
+
+POINTS = {
+ 'BA1': 'stand-alone B',
+ 'BS1': '-4 pts',
+ 'PM2_Supporting': '+1 pt',
+}
+
+RULES = {
+ 'BA1': 'gnomAD v4.1 FAF >= 0.001 (0.1%)',
+ 'BS1': 'gnomAD v4.1 FAF in [0.0003, 0.001)',
+ 'PM2_Supporting': 'gnomAD v4.1 AF < 3e-5 global AND grpmax < 4e-5',
+}
+
+AUTOSQL = """table TP53AF
+"TP53 VCEP ACMG allele frequency classifications from gnomAD v4.1 exomes"
+ (
+ string chrom; "Reference sequence chromosome or scaffold"
+ uint chromStart; "Start position in chromosome"
+ uint chromEnd; "End position in chromosome"
+ string name; "Variant display name"
+ uint score; "Not used, all 0"
+ char[1] strand; "Not used, all ."
+ uint thickStart; "Same as chromStart"
+ uint thickEnd; "Same as chromEnd"
+ uint reserved; "RGB color"
+ string acmgCode; "BA1 / BS1 / PM2_Supporting"
+ string points; "Tavtigian points"
+ string af; "Global AF in gnomAD v4.1 exomes"
+ string faf; "Filter allele frequency (faf95)"
+ string grpmax_af; "AF in the grpmax population"
+ string grpmax_pop; "Population with grpmax AF"
+ string chipNote; "gnomAD CHIP annotation (if any)"
+ lstring _mouseOver; "HTML mouseover"
+ )
+"""
+
+
+def classify(af_global, faf, af_grpmax):
+ if faf is not None and faf >= BA1_FAF:
+ return 'BA1'
+ if faf is not None and BS1_FAF_LOW <= faf < BS1_FAF_HIGH:
+ return 'BS1'
+ # PM2_Supporting: rare globally AND grpmax under limit
+ if (af_global is not None and af_global < PM2_AF_GLOBAL_MAX
+ and af_grpmax is not None and af_grpmax < PM2_AF_GRPMAX_MAX
+ and (af_global > 0 or af_grpmax > 0)):
+ return 'PM2_Supporting'
+ return None
+
+
+def safe_float(s):
+ if s is None or s == '' or s == 'N/A':
+ return None
+ try:
+ return float(s)
+ except ValueError:
+ return None
+
+
+def mouseover(display, code, ref, alt, af_global, faf, af_grpmax, grpmax_pop, chip):
+ chip_line = ""
+ if chip and chip not in ('N/A', ''):
+ chip_line = "
CHIP note: {}".format(chip)
+ return (
+ "Variant: {disp} ({ref}>{alt})"
+ "
ACMG code: {code} ({pts})"
+ "
Rule: {rule}"
+ "
Global AF: {af}"
+ "
FAF (faf95): {faf}"
+ "
grpmax AF: {gmax} ({gpop})"
+ "{chip}"
+ "
Source: gnomAD v4.1 exomes"
+ ).format(
+ disp=display, ref=ref, alt=alt,
+ code=code, pts=POINTS[code], rule=RULES[code],
+ af="{:.2e}".format(af_global) if af_global is not None else "N/A",
+ faf="{:.2e}".format(faf) if faf is not None else "N/A",
+ gmax="{:.2e}".format(af_grpmax) if af_grpmax is not None else "N/A",
+ gpop=grpmax_pop or "N/A",
+ chip=chip_line,
+ )
+
+
+def classify_and_build_rows(tx, chrom):
+ """Query gnomAD v4.1 exomes on hg38 and emit a list of classified rows
+ keyed by an immutable hg38 identifier. The hg38 identifier is used as
+ the 'name' field so the hg19 build can look up the same row after liftOver
+ and rewrite the display text to reflect hg19 coords."""
+ raw_bed = "/tmp/tp53_gnomad_{}.bed".format(os.getpid())
+ cmd = "bigBedToBed {} -chrom={} -start={} -end={} {}".format(
+ GNOMAD_BB, chrom, tx['txStart'], tx['txEnd'], raw_bed)
+ lib.bash(cmd)
+ with open(raw_bed) as f:
+ rows = [line.rstrip("\n").split("\t") for line in f]
+ os.remove(raw_bed)
+ print(" {} variants in TP53 region (hg38)".format(len(rows)))
+
+ # Build rows keyed on the hg38 display id
+ classified = [] # list of dicts with all fields; hg38 coords fixed
+ stats = dict(total=len(rows), BA1=0, BS1=0, PM2=0, skipped=0)
+ for r in rows:
+ c_start = int(r[1])
+ c_end = int(r[2])
+ ref = r[9]
+ alt = r[10]
+ af_global = safe_float(r[14])
+ faf = safe_float(r[15])
+ af_grpmax = safe_float(r[26])
+ grpmax_pop = r[23] if r[23] not in ('N/A', '') else None
+ chip = r[28] if len(r) > 28 else ''
+ hg38_name = "{}-{}-{}-{}".format(chrom, c_start + 1, ref, alt)
+
+ code = classify(af_global, faf, af_grpmax)
+ if code is None:
+ stats['skipped'] += 1
+ continue
+ stats[code if code in ('BA1', 'BS1') else 'PM2'] += 1
+ classified.append({
+ 'hg38_name': hg38_name,
+ 'hg38_start': c_start,
+ 'hg38_end': c_end,
+ 'chrom': chrom,
+ 'ref': ref, 'alt': alt,
+ 'af_global': af_global, 'faf': faf, 'af_grpmax': af_grpmax,
+ 'grpmax_pop': grpmax_pop, 'chip': chip,
+ 'code': code,
+ })
+ print(" classified: BA1={BA1} BS1={BS1} PM2={PM2} skipped={skipped}".format(**stats))
+ return classified
+
+
+def emit_rows(classified, assembly, coord_lookup=None):
+ """Format rows for the given assembly. coord_lookup is a dict keyed on
+ hg38_name → (chrom, 0-based start, end) for hg19 liftOver. For hg38 pass
+ None — rows use their native hg38 coords."""
+ lines = []
+ for rec in classified:
+ if assembly == 'hg38':
+ chrom = rec['chrom']
+ start = rec['hg38_start']
+ end = rec['hg38_end']
+ else:
+ if rec['hg38_name'] not in coord_lookup:
+ continue
+ chrom, start, end = coord_lookup[rec['hg38_name']]
+ # Use assembly-appropriate display name so hg19 viewers see hg19 pos
+ display = "{}-{}-{}-{}".format(chrom, start + 1, rec['ref'], rec['alt'])
+ color = COLORS[rec['code']]
+ mo = mouseover(display, rec['code'], rec['ref'], rec['alt'],
+ rec['af_global'], rec['faf'], rec['af_grpmax'],
+ rec['grpmax_pop'], rec['chip'])
+ lines.append("\t".join([
+ chrom, str(start), str(end),
+ display, "0", ".",
+ str(start), str(end),
+ color, rec['code'], POINTS[rec['code']],
+ "{:.2e}".format(rec['af_global']) if rec['af_global'] is not None else "N/A",
+ "{:.2e}".format(rec['faf']) if rec['faf'] is not None else "N/A",
+ "{:.2e}".format(rec['af_grpmax']) if rec['af_grpmax'] is not None else "N/A",
+ rec['grpmax_pop'] or "N/A",
+ rec['chip'] or "",
+ mo,
+ ]))
+ return lines
+
+
+def liftover_hg38_to_hg19(classified, outdir):
+ """Lift each hg38 coord to hg19, returning dict hg38_name → (chrom,start,end)."""
+ chain = "/cluster/data/hg38/bed/liftOver/hg38ToHg19.over.chain.gz"
+ input_bed = os.path.join(outdir, ".tp53af_lift_in.bed")
+ output_bed = os.path.join(outdir, ".tp53af_lift_out.bed")
+ unmapped = os.path.join(outdir, ".tp53af_unmapped.bed")
+ with open(input_bed, 'w') as f:
+ for rec in classified:
+ f.write("{}\t{}\t{}\t{}\n".format(
+ rec['chrom'], rec['hg38_start'], rec['hg38_end'], rec['hg38_name']))
+ lib.bash("liftOver {} {} {} {}".format(input_bed, chain, output_bed, unmapped))
+ lookup = {}
+ with open(output_bed) as f:
+ for line in f:
+ flds = line.rstrip("\n").split("\t")
+ if len(flds) >= 4:
+ lookup[flds[3]] = (flds[0], int(flds[1]), int(flds[2]))
+ for p in [input_bed, output_bed, unmapped]:
+ if os.path.exists(p):
+ os.remove(p)
+ return lookup
+
+
+def build(db, outdir):
+ print("=== {} ===".format(db))
+ os.makedirs(outdir, exist_ok=True)
+ # We always query gnomAD on hg38 (the source), then lift to hg19 if needed
+ tx_hg38 = lib.get_transcript_info('hg38')
+ classified = classify_and_build_rows(tx_hg38, tx_hg38['chrom'])
+
+ as_file = os.path.join(outdir, "TP53AF.as")
+ lib.write_autosql(as_file, AUTOSQL)
+ bed = os.path.join(outdir, "TP53AF_{}.bed".format(db))
+ bb = os.path.join(outdir, "TP53AF{}.bb".format(db.capitalize()))
+
+ if db == 'hg38':
+ lines = emit_rows(classified, 'hg38')
+ with open(bed, 'w') as f:
+ f.write("\n".join(lines) + "\n")
+ lib.bash("sort -k1,1 -k2,2n {0} -o {0}".format(bed))
+ lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+8")
+ print(" wrote {}".format(bb))
+ return
+
+ # hg19 build: liftOver each record and rewrite display name
+ lookup = liftover_hg38_to_hg19(classified, outdir)
+ dropped = len(classified) - len(lookup)
+ if dropped:
+ print(" liftOver dropped {} variants".format(dropped))
+ lines = emit_rows(classified, 'hg19', coord_lookup=lookup)
+ with open(bed, 'w') as f:
+ f.write("\n".join(lines) + "\n")
+ lib.bash("sort -k1,1 -k2,2n {0} -o {0}".format(bed))
+ lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+8")
+ print(" wrote {}".format(bb))
+ return
+
+
+
+
+def main():
+ p = argparse.ArgumentParser(description=__doc__)
+ p.add_argument('-o', '--output-dir', default=DEFAULT_OUTDIR)
+ p.add_argument('--db', action='append', help='hg38 or hg19 (repeat). Default hg38.')
+ args = p.parse_args()
+ dbs = args.db if args.db else ['hg38']
+ for db in dbs:
+ build(db, args.output_dir)
+
+
+if __name__ == "__main__":
+ main()