f24ea956ba3b7a8c7ee8be0826d66489b85e0bbc
lrnassar
  Tue Apr 28 18:22:48 2026 -0700
Adding TP53 VCEP track hub build scripts. refs #37399

16 Python scripts that build the 15 bigBed tracks for the ClinGen TP53
VCEP track hub (CSpec GN009 v2.4.0, NM_000546.6 / NP_000537.3). Includes
the NON-FINAL Provisional Classification (Tavtigian point sum across PM1
+ PS3/BS3 + PP3/BP4 + AF + BS2 + splicing PP3), gnomAD v4.1 AF codes,
FLOSSIES BS2 evidence, Bioinformatic PP3/BP4 (missense + single-aa
in-frame del), VCEP curated variants from EvRepo with ClinVar VCV
backfill, PM1 (clinical domains + cancerhotspots), PVS1 regions and
splice sites, and the Functional Evidence composite (VCEP preliminary
PS3/BS3 with per-paper raw scores from Kato/Giacomelli/Kawaguchi/Funk).

diff --git src/hg/makeDb/scripts/tp53/tp53Flossies.py src/hg/makeDb/scripts/tp53/tp53Flossies.py
new file mode 100644
index 00000000000..3410cf07bef
--- /dev/null
+++ src/hg/makeDb/scripts/tp53/tp53Flossies.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+"""
+TP53 VCEP FLOSSIES BS2 track generator.
+
+FLOSSIES (Female-specific aging cohort, https://whi.color.com/) sequences
+TP53 in ~4,942 women >70 unaffected by cancer. Per CSpec GN009 v2.4.0,
+observation in this cohort is a BS2 evidence source for TP53 (variant
+observed in healthy aged controls weakens pathogenic interpretation).
+
+Source: window.table_variants embedded in
+https://whi.color.com/gene/ENSG00000141510 (server-rendered HTML).
+Coordinates are hg19 / NM_000546.5; we lift to hg38 and reuse the same
+genomic position (NM_000546.5 and .6 are coding-identical for TP53).
+
+bigBed 9+8 with filterValues on bs2Applies and consequence.
+"""
+
+import argparse
+import json
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+import tp53FuncLib as lib
+
+DEFAULT_OUTDIR = "/hive/users/lrnassar/claude/RM37399/flossies"
+DEFAULT_SRC = "/hive/users/lrnassar/claude/RM37399/flossies_tp53.json"
+
+COLORS = {
+    'BS2':           '35,159,134',  # teal (matches BS1 family in AF track)
+    'Informational': '180,180,180', # gray (non-coding observations)
+}
+
+# Per CSpec GN009 v2.4.0 §BS2: variant observed in cohort of healthy
+# adult women >70 weakens disease-causation evidence. We apply BS2 to
+# coding-region observations (missense, synonymous, splice). Non-coding
+# (UTR / deep intronic) observations are kept in the track for reference
+# but flagged "Informational" since BS2 is not standardly applied to
+# non-coding TP53 changes.
+CODING_CONSEQUENCES = {
+    'missense_variant', 'synonymous_variant', 'splice_donor_variant',
+    'splice_acceptor_variant', 'splice_region_variant', 'stop_gained',
+    'stop_lost', 'start_lost', 'frameshift_variant',
+    'inframe_insertion', 'inframe_deletion',
+}
+
+AUTOSQL = """table TP53Flossies
+"TP53 VCEP BS2 evidence: observations in the FLOSSIES healthy-women-over-70 cohort"
+   (
+   string chrom;        "Reference sequence chromosome or scaffold"
+   uint   chromStart;   "Start position in chromosome"
+   uint   chromEnd;     "End position in chromosome"
+   string name;         "Variant display name"
+   uint   score;        "Not used, all 0"
+   char[1] strand;      "Not used, all ."
+   uint   thickStart;   "Same as chromStart"
+   uint   thickEnd;     "Same as chromEnd"
+   uint   reserved;     "RGB color"
+   string bs2Applies;   "BS2 / Informational"
+   string points;       "Tavtigian points (-4 for BS2, 0 for Informational)"
+   string consequence;  "Predicted consequence (missense, synonymous, UTR, etc.)"
+   string hgvsc;        "HGVSc cDNA notation"
+   string hgvsp;        "HGVSp protein notation (if coding)"
+   uint   alleleCount;  "Allele count across the FLOSSIES cohort"
+   uint   alleleNum;    "Total allele count (cohort size x 2)"
+   string popBreakdown; "Per-population AC/AN"
+   string genomicRef;   "Genomic reference allele (matches chromStart base on + strand)"
+   string genomicAlt;   "Genomic alternate allele"
+   lstring _mouseOver;  "HTML mouseover"
+   )
+"""
+
+
+def categorize(consequence):
+    """Return ('BS2', '-4 pts') or ('Informational', '0 pts')."""
+    if consequence in CODING_CONSEQUENCES:
+        return ('BS2', '-4 pts')
+    return ('Informational', '0 pts')
+
+
+def mouseover(disp, hgvsc, hgvsp, conseq, applies, pts,
+              ac, an, pop_acs, pop_ans):
+    pop_lines = []
+    for pop in pop_acs:
+        an_p = pop_ans.get(pop, 0) or 0
+        ac_p = pop_acs.get(pop, 0) or 0
+        if an_p:
+            pop_lines.append("{}: {}/{} ({:.4f})".format(
+                pop, ac_p, an_p, ac_p / an_p))
+        else:
+            pop_lines.append("{}: {}/{}".format(pop, ac_p, an_p))
+    af = (ac / an) if an else 0.0
+    bs2_section = ""
+    if applies == 'BS2':
+        bs2_section = (
+            "<br><b>BS2 applicability:</b> Yes ({} pts) "
+            "&#8212; coding observation in healthy aged-female cohort"
+        ).format(pts)
+    else:
+        bs2_section = (
+            "<br><b>BS2 applicability:</b> Informational only "
+            "&#8212; non-coding consequence; BS2 is not standardly applied "
+            "to TP53 UTR/intronic changes per CSpec GN009"
+        )
+    hgvsp_line = ""
+    if hgvsp:
+        hgvsp_line = "<br><b>Protein:</b> {}".format(hgvsp)
+    return (
+        "<b>FLOSSIES observation (BS2 evidence)</b>"
+        "<br><b>Variant:</b> {disp}"
+        "<br><b>cDNA:</b> {hgvsc}"
+        "{hgvsp_line}"
+        "<br><b>Consequence:</b> {cons}"
+        "<br><b>Cohort:</b> healthy women &gt;70 yo (FLOSSIES, WHI/UW)"
+        "<br><b>Allele count:</b> {ac}/{an} (AF {af:.4f})"
+        "<br><b>By population:</b> {pop}"
+        "{bs2}"
+        "<br><b>Source:</b> FLOSSIES (whi.color.com); "
+        "applied per CSpec GN009 v2.4.0 &#167;BS2"
+    ).format(
+        disp=disp, hgvsc=hgvsc or "N/A", hgvsp_line=hgvsp_line,
+        cons=conseq, ac=ac, an=an, af=af,
+        pop=" | ".join(pop_lines) if pop_lines else "N/A",
+        bs2=bs2_section,
+    )
+
+
+def load_flossies(src_json):
+    with open(src_json) as f:
+        return json.load(f)
+
+
+def hg19_to_hg38_lift(records, outdir):
+    """Lift FLOSSIES hg19 coords to hg38; return dict keyed on variant_id."""
+    chain = "/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz"
+    if not os.path.exists(chain):
+        chain = "/cluster/data/hg19/bed/liftOver/hg19ToHg38.over.chain.gz"
+    in_bed = os.path.join(outdir, ".flossies_lift_in.bed")
+    out_bed = os.path.join(outdir, ".flossies_lift_out.bed")
+    unmapped = os.path.join(outdir, ".flossies_unmapped.bed")
+    with open(in_bed, 'w') as f:
+        for v in records:
+            pos = int(v['pos'])
+            ref = v['ref']
+            f.write("chr{}\t{}\t{}\t{}\n".format(
+                v['chrom'], pos - 1, pos - 1 + len(ref), v['variant_id']))
+    lib.bash("liftOver {} {} {} {}".format(in_bed, chain, out_bed, unmapped))
+    lookup = {}
+    with open(out_bed) as f:
+        for line in f:
+            flds = line.rstrip("\n").split("\t")
+            if len(flds) >= 4:
+                lookup[flds[3]] = (flds[0], int(flds[1]), int(flds[2]))
+    for p in [in_bed, out_bed, unmapped]:
+        if os.path.exists(p):
+            os.remove(p)
+    return lookup
+
+
+def emit_rows(records, assembly, hg38_lookup):
+    """Build BED9+8 lines for the requested assembly."""
+    lines = []
+    skipped = 0
+    for v in records:
+        ref = v['ref']
+        alt = v['alt']
+        pos_hg19 = int(v['pos'])
+        if assembly == 'hg19':
+            chrom = "chr{}".format(v['chrom'])
+            start = pos_hg19 - 1
+            end = start + len(ref)
+        else:
+            if v['variant_id'] not in hg38_lookup:
+                skipped += 1
+                continue
+            chrom, start, end = hg38_lookup[v['variant_id']]
+        conseq = v.get('major_consequence') or 'unknown'
+        applies, pts = categorize(conseq)
+        color = COLORS[applies]
+        hgvsc = v.get('HGVSc') or ''
+        hgvsp = v.get('HGVSp') or ''
+        ac = int(v.get('allele_count') or 0)
+        an = int(v.get('allele_num') or 0)
+        pop_acs = v.get('pop_acs') or {}
+        pop_ans = v.get('pop_ans') or {}
+        pop_breakdown = "; ".join(
+            "{}={}/{}".format(p, pop_acs.get(p, 0), pop_ans.get(p, 0))
+            for p in pop_acs
+        )
+        disp = "{}:{}{}>{}".format(chrom, start + 1, ref, alt)
+        name = hgvsp if hgvsp else (hgvsc if hgvsc else disp)
+        mo = mouseover(disp, hgvsc, hgvsp, conseq, applies, pts,
+                       ac, an, pop_acs, pop_ans)
+        lines.append("\t".join([
+            chrom, str(start), str(end),
+            name, "0", ".",
+            str(start), str(end),
+            color, applies, pts,
+            conseq, hgvsc, hgvsp,
+            str(ac), str(an), pop_breakdown,
+            ref, alt,
+            mo,
+        ]))
+    if skipped:
+        print("  liftOver dropped {} variants".format(skipped))
+    return lines
+
+
+def build(db, outdir, src_json):
+    print("=== {} ===".format(db))
+    os.makedirs(outdir, exist_ok=True)
+    records = load_flossies(src_json)
+    print("  {} FLOSSIES records loaded".format(len(records)))
+    from collections import Counter
+    consequences = Counter(r.get('major_consequence') for r in records)
+    print("  Consequences: {}".format(dict(consequences)))
+
+    if db == 'hg38':
+        hg38_lookup = hg19_to_hg38_lift(records, outdir)
+        print("  liftOver matched {}/{} variants".format(
+            len(hg38_lookup), len(records)))
+        lines = emit_rows(records, 'hg38', hg38_lookup)
+    else:
+        lines = emit_rows(records, 'hg19', None)
+    print("  {} BED rows".format(len(lines)))
+
+    bs2_count = sum(1 for L in lines if "\tBS2\t" in L)
+    print("  BS2 applicable: {}".format(bs2_count))
+
+    as_file = os.path.join(outdir, "TP53Flossies.as")
+    lib.write_autosql(as_file, AUTOSQL)
+    bed = os.path.join(outdir, "TP53Flossies_{}.bed".format(db))
+    with open(bed, 'w') as f:
+        f.write("\n".join(lines) + "\n")
+    lib.bash("sort -k1,1 -k2,2n {0} -o {0}".format(bed))
+    bb = os.path.join(outdir, "TP53Flossies{}.bb".format(db.capitalize()))
+    lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+10")
+    print("  wrote {}".format(bb))
+
+
+def main():
+    p = argparse.ArgumentParser(description=__doc__)
+    p.add_argument('-o', '--output-dir', default=DEFAULT_OUTDIR)
+    p.add_argument('--db', action='append', help='hg38 or hg19 (repeat). Default hg38.')
+    p.add_argument('--src', default=DEFAULT_SRC)
+    args = p.parse_args()
+    dbs = args.db if args.db else ['hg38']
+    for db in dbs:
+        build(db, args.output_dir, args.src)
+
+
+if __name__ == "__main__":
+    main()