f24ea956ba3b7a8c7ee8be0826d66489b85e0bbc
lrnassar
  Tue Apr 28 18:22:48 2026 -0700
Adding TP53 VCEP track hub build scripts. refs #37399

16 Python scripts that build the 15 bigBed tracks for the ClinGen TP53
VCEP track hub (CSpec GN009 v2.4.0, NM_000546.6 / NP_000537.3). Includes
the NON-FINAL Provisional Classification (Tavtigian point sum across PM1
+ PS3/BS3 + PP3/BP4 + AF + BS2 + splicing PP3), gnomAD v4.1 AF codes,
FLOSSIES BS2 evidence, Bioinformatic PP3/BP4 (missense + single-aa
in-frame del), VCEP curated variants from EvRepo with ClinVar VCV
backfill, PM1 (clinical domains + cancerhotspots), PVS1 regions and
splice sites, and the Functional Evidence composite (VCEP preliminary
PS3/BS3 with per-paper raw scores from Kato/Giacomelli/Kawaguchi/Funk).

diff --git src/hg/makeDb/scripts/tp53/tp53PVS1Splice.py src/hg/makeDb/scripts/tp53/tp53PVS1Splice.py
new file mode 100644
index 00000000000..8403d0b88e5
--- /dev/null
+++ src/hg/makeDb/scripts/tp53/tp53PVS1Splice.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""
+TP53 VCEP PVS1 Splice Sites subtrack (under PVS1 Evidence composite).
+
+Parses Supplementary Table S1 (1,061 rows) from the CSpec GN009 v2.4.0
+splicing worksheet. Extracts the 120 canonical +/- 1,2 splice-site SNVs,
+forward-fills PVS1 strength assignments from the anchor variant to its
+two sibling variants at the same position, and emits bigBed 9+6.
+
+Transcript: NM_000546.6 (chr17 minus strand).
+"""
+
+import argparse
+import os
+import re
+import sys
+
+import openpyxl
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import tp53FuncLib as lib
+
+DEFAULT_OUTDIR = "/hive/users/lrnassar/claude/RM37399/pvs1Splice"
+DEFAULT_SRC = "/hive/users/lrnassar/claude/RM37399/tp53_downloads/splicing_worksheet.xlsx"
+
+VAR_RE = re.compile(r'^c\.(-?\d+)([+-])([12])([ACGT])>([ACGT])$')
+
+# (color, points, acmg_for_display). RNA-variants use same color as canonical.
+STRENGTH_MAP = {
+    "PVS1":               ("180,0,0",  "+8", "PVS1"),
+    "PVS1 (RNA)":         ("180,0,0",  "+8", "PVS1 (RNA)"),
+    "PVS1_Strong":        ("210,0,0",  "+4", "PVS1_Strong"),
+    "PVS1_Strong (RNA)":  ("210,0,0",  "+4", "PVS1_Strong (RNA)"),
+    "PVS1_Moderate":      ("204,102,0","+2", "PVS1_Moderate"),
+    "PVS1_Moderate (RNA)":("204,102,0","+2", "PVS1_Moderate (RNA)"),
+    "PVS1_N/A":           ("128,128,128","0", "PVS1_N/A"),
+}
+UNASSIGNED = ("200,200,200", "0", "Not yet assigned by VCEP")
+
+AUTOSQL = """table TP53PVS1Splice
+"TP53 VCEP PVS1 splice-site variants (+/- 1,2) from CSpec GN009 v2.4.0 Table S1"
+   (
+   string chrom;       "Reference sequence chromosome or scaffold"
+   uint   chromStart;  "Start position in chromosome"
+   uint   chromEnd;    "End position in chromosome"
+   string name;        "HGVSc variant"
+   uint   score;       "Not used, all 0"
+   char[1] strand;     "Not used, all ."
+   uint   thickStart;  "Same as chromStart"
+   uint   thickEnd;    "Same as chromEnd"
+   uint   reserved;    "RGB color"
+   string acmgCode;    "VCEP-assigned PVS1 strength"
+   string points;      "Tavtigian points"
+   string assignSource; "'Assigned' or 'Inherited from site anchor'"
+   lstring rationale;  "VCEP rationale text (from Table S1)"
+   uint   exonNum;     "mRNA exon number"
+   lstring _mouseOver; "HTML mouseover"
+   )
+"""
+
+
+def cdna_splice_to_genomic(exon_num, sign, offset, tx):
+    """Return BED (g_start, g_end) for a canonical +/-1/+/-2 splice variant."""
+    n_exons = len(tx['exonStarts'])
+    if tx['strand'] == '-':
+        idx = n_exons - exon_num
+    else:
+        idx = exon_num - 1
+    ex_g_start = tx['exonStarts'][idx]
+    ex_g_end = tx['exonEnds'][idx]
+    if tx['strand'] == '-':
+        if sign == '+':
+            g = ex_g_start - offset
+        else:
+            g = ex_g_end + offset - 1
+    else:
+        if sign == '+':
+            g = ex_g_end + offset - 1
+        else:
+            g = ex_g_start - offset
+    return (g, g + 1)
+
+
+def load_variants(src_xlsx):
+    wb = openpyxl.load_workbook(src_xlsx, data_only=True)
+    ws = wb["Table S1"]
+    variants = []
+    for row in ws.iter_rows(min_row=4, values_only=True):
+        if row[0] is None or not row[2] or not isinstance(row[2], str):
+            continue
+        m = VAR_RE.match(row[2].strip())
+        if not m:
+            continue
+        variants.append({
+            'exon': int(row[0]),
+            'variant': row[2].strip(),
+            'c_pos': int(m.group(1)),
+            'sign': m.group(2),
+            'offset': int(m.group(3)),
+            'ref': m.group(4),
+            'alt': m.group(5),
+            'pvs1_raw': row[15],
+            'rationale_raw': row[16],
+        })
+    # Forward-fill within each splice-site position
+    by_site = {}
+    for v in variants:
+        key = (v['c_pos'], v['sign'], v['offset'])
+        by_site.setdefault(key, []).append(v)
+    for _key, vs in by_site.items():
+        anchor_pvs1 = next((x['pvs1_raw'] for x in vs if x['pvs1_raw']), None)
+        anchor_rat = next((x['rationale_raw'] for x in vs if x['rationale_raw']), None)
+        for x in vs:
+            if x['pvs1_raw']:
+                x['final_pvs1'] = x['pvs1_raw']
+                x['final_rat'] = x['rationale_raw'] or anchor_rat
+                x['assign_source'] = "Assigned"
+            elif anchor_pvs1:
+                x['final_pvs1'] = anchor_pvs1
+                x['final_rat'] = anchor_rat
+                x['assign_source'] = "Inherited from site anchor"
+            else:
+                x['final_pvs1'] = None
+                x['final_rat'] = None
+                x['assign_source'] = "Not yet assigned"
+    return variants
+
+
+def mouseover(v, color, pts, acmg):
+    # Build rationale line: assigned variants should not say "not explicitly
+    # assigned" just because the rationale cell is blank.
+    if v['final_rat']:
+        rat_text = v['final_rat']
+    elif v['final_pvs1']:
+        rat_text = ("VCEP-assigned strength; detailed rationale not populated "
+                    "in Table S1 for this position.")
+    else:
+        rat_text = "VCEP has not explicitly assigned a strength for this splice site."
+    return lib.html_ascii_safe((
+        "<b>PVS1 splice-site variant:</b> {name} ({pts} pts)"
+        "<br><b>ACMG code:</b> {acmg}"
+        "<br><b>Exon:</b> {exon} ({sign_label})"
+        "<br><b>Assignment:</b> {src}"
+        "<br><b>VCEP rationale:</b> {rat}"
+        "<br><b>Source:</b> CSpec GN009 v2.4.0 Supplementary Table S1"
+    ).format(
+        name=v['variant'], pts=pts, acmg=acmg,
+        exon=v['exon'],
+        sign_label="donor ({}{})".format(v['sign'], v['offset']) if v['sign'] == '+'
+        else "acceptor ({}{})".format(v['sign'], v['offset']),
+        src=v['assign_source'], rat=rat_text,
+    ))
+
+
+def generate_bed(variants, tx):
+    lines = []
+    chrom = tx['chrom']
+    for v in variants:
+        try:
+            g_start, g_end = cdna_splice_to_genomic(v['exon'], v['sign'], v['offset'], tx)
+        except IndexError:
+            continue
+        if g_start >= g_end:
+            continue
+        final = v['final_pvs1']
+        if final and final in STRENGTH_MAP:
+            color, pts, acmg = STRENGTH_MAP[final]
+        else:
+            color, pts, acmg = UNASSIGNED
+        lines.append("\t".join([
+            chrom, str(g_start), str(g_end),
+            v['variant'], "0", ".",
+            str(g_start), str(g_end),
+            color, acmg, pts, v['assign_source'],
+            lib.html_ascii_safe(v['final_rat'] or ""),
+            str(v['exon']),
+            mouseover(v, color, pts, acmg),
+        ]))
+    return lines
+
+
+def build(db, outdir, src_xlsx):
+    print("=== {} ===".format(db))
+    os.makedirs(outdir, exist_ok=True)
+    tx = lib.get_transcript_info(db)
+    variants = load_variants(src_xlsx)
+    print("  {} splice variants parsed".format(len(variants)))
+    from collections import Counter
+    dist = Counter(v['final_pvs1'] or "Unassigned" for v in variants)
+    print("  Strength distribution:")
+    for k, n in dist.most_common():
+        print("    {}: {}".format(k, n))
+    bed_lines = generate_bed(variants, tx)
+    print("  {} BED rows".format(len(bed_lines)))
+
+    as_file = os.path.join(outdir, "TP53PVS1Splice.as")
+    lib.write_autosql(as_file, AUTOSQL)
+    bed = os.path.join(outdir, "TP53PVS1Splice_{}.bed".format(db))
+    with open(bed, 'w') as f:
+        f.write("\n".join(bed_lines) + "\n")
+    lib.bash("sort -k1,1 -k2,2n {0} -o {0}".format(bed))
+    bb = os.path.join(outdir, "TP53PVS1Splice{}.bb".format(db.capitalize()))
+    lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+6")
+    print("  wrote {}".format(bb))
+
+
+def main():
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument('-o', '--output-dir', default=DEFAULT_OUTDIR)
+    p.add_argument('--db', action='append', help='hg38 or hg19 (repeat). Default hg38.')
+    p.add_argument('--src', default=DEFAULT_SRC, help='Source XLSX (Table S1)')
+    args = p.parse_args()
+    dbs = args.db if args.db else ['hg38']
+    for db in dbs:
+        build(db, args.output_dir, args.src)
+
+
+if __name__ == "__main__":
+    main()