#!/usr/bin/env python3
"""
TP53 VCEP Clinical Domains track generator.

Builds bigBed 9+5 for the seven clinically relevant TP53 protein domains
defined in ClinGen CSpec GN009 v2.4.0 (PM1), plus the six PM1_Moderate
hotspot codons (R175, G245, R248, R249, R273, R282) overlaid on the DBD.

Transcript: NM_000546.6 / NP_000537.3 (MANE Select), 393 aa, chr17 minus strand.
PM1 is applicable for TP53 (unlike the MMR genes in the InSiGHT hub).
"""

# Part of the ClinGen TP53 VCEP track hub build scripts (refs #37399);
# repository path: src/hg/makeDb/scripts/tp53/tp53ClinDomains.py

import argparse
import os
import shlex
import sys

# tp53FuncLib is imported from this script's own directory, so put that
# directory first on the module search path regardless of the caller's cwd.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import tp53FuncLib as lib  # noqa: E402  (must follow the sys.path tweak)

DEFAULT_OUTDIR = "/hive/users/lrnassar/claude/RM37399/clinDomains"

# Protein accession paired with lib.TRANSCRIPT in the mouseover text.
PROTEIN_ACCESSION = "NP_000537.3"

# Domains from CSpec GN009 v2.4.0 §PM1 (aa ranges on NP_000537.3)
# Each entry: (short name, first aa, last aa, description).
DOMAINS = [
    ("TAD1", 17, 25, "Transactivation domain 1"),
    ("TAD2", 48, 56, "Transactivation domain 2"),
    ("PRR", 64, 92, "Proline-rich region"),
    ("DBD", 100, 292, "DNA binding domain"),
    ("Hinge", 293, 324, "Hinge"),
    ("OD", 325, 356, "Oligomerization (tetramerization) domain"),
    ("CTD", 368, 387, "C-terminal (basic / regulatory) domain"),
]

# PM1_Moderate hotspot codons. Tavtigian +2 points each.
# Each entry: (codon number, residue label, role).
HOTSPOT_CODONS = [
    (175, "R175", "DNA-contact / structural hotspot"),
    (245, "G245", "structural hotspot"),
    (248, "R248", "DNA-contact hotspot"),
    (249, "R249", "structural hotspot"),
    (273, "R273", "DNA-contact hotspot"),
    (282, "R282", "DNA-contact hotspot"),
]

DOMAIN_COLOR = "138,111,158"  # purple
HOTSPOT_COLOR = "230,3,131"   # fuchsia

# autoSql schema for the bed9+5 rows; the field order here must match the
# column order produced by _bed_row().
AUTOSQL = """table TP53clinDomains
"TP53 VCEP clinically relevant protein domains and PM1_Moderate hotspot codons (NM_000546.6)"
    (
    string chrom; "Reference sequence chromosome or scaffold"
    uint chromStart; "Start position in chromosome"
    uint chromEnd; "End position in chromosome"
    string name; "Domain name or hotspot residue"
    uint score; "Not used, all 0"
    char[1] strand; "Not used, all ."
    uint thickStart; "Same as chromStart"
    uint thickEnd; "Same as chromEnd"
    uint reserved; "RGB value"
    string domainType; "Either 'Domain' or 'PM1_Moderate hotspot'"
    string NMaccession; "Transcript accession (NM_000546.6)"
    string AAlocation; "Amino acid range"
    string description; "Description or role"
    lstring _mouseOver; "HTML mouseover"
    )
"""


def domain_mouseover(name, desc, aa_lo, aa_hi):
    """Return the HTML mouseover text for one protein domain.

    Args:
        name: Short domain name (e.g. "DBD").
        desc: Human-readable domain description.
        aa_lo: First amino acid of the domain.
        aa_hi: Last amino acid of the domain.

    Returns:
        A single-line HTML fragment using <b> and <br> markup.
    """
    return (
        "<b>Domain:</b> {name} ({desc})"
        "<br><b>Gene:</b> TP53"
        "<br><b>Transcript:</b> {tx} ({protein})"
        "<br><b>Amino acid loc:</b> {lo}-{hi}"
        "<br><b>Note:</b> used in PVS1 decision tree; PM1 hotspot codons "
        "overlaid on DBD contribute +2 pts each"
    ).format(name=name, desc=desc, tx=lib.TRANSCRIPT,
             protein=PROTEIN_ACCESSION, lo=aa_lo, hi=aa_hi)


def hotspot_mouseover(label, role, codon):
    """Return the HTML mouseover text for one PM1_Moderate hotspot codon.

    Args:
        label: Residue label (e.g. "R175").
        role: Short description of the hotspot's role.
        codon: Codon number of the hotspot.

    Returns:
        A single-line HTML fragment using <b> and <br> markup.
    """
    return (
        "<b>PM1_Moderate hotspot:</b> {label} (+2 pts)"
        "<br><b>Role:</b> {role}"
        "<br><b>Gene:</b> TP53"
        "<br><b>Transcript:</b> {tx} ({protein})"
        "<br><b>Codon:</b> {codon}"
        "<br><b>ACMG code:</b> PM1_Moderate"
    ).format(label=label, role=role, tx=lib.TRANSCRIPT,
             protein=PROTEIN_ACCESSION, codon=codon)


def _bed_row(chrom, g_start, g_end, name, color, domain_type, aa_location,
             description, mouseover):
    """Format one tab-separated bed9+5 row in the AUTOSQL column order."""
    start, end = str(g_start), str(g_end)
    return "\t".join([
        chrom, start, end,
        name, "0", ".",      # score and strand are unused placeholders
        start, end,          # thickStart/thickEnd mirror chromStart/chromEnd
        color,
        domain_type, lib.TRANSCRIPT,
        aa_location,
        description,
        mouseover,
    ])


def generate_bed(tx):
    """Build the BED rows for all domains followed by all hotspot codons.

    Args:
        tx: Transcript record as returned by lib.get_transcript_info();
            only tx['chrom'] is read here, the record is otherwise passed
            through to the coordinate helpers in tp53FuncLib.

    Returns:
        A list of tab-separated bed9+5 row strings (unsorted, no trailing
        newlines). One row is emitted per non-empty genomic interval that the
        library returns, so a single domain or codon may yield several rows
        (presumably one per exon it touches; confirm against tp53FuncLib).
    """
    rows = []
    chrom = tx['chrom']

    for name, aa_lo, aa_hi, desc in DOMAINS:
        mouseover = domain_mouseover(name, desc, aa_lo, aa_hi)
        aa_location = "{}-{}".format(aa_lo, aa_hi)
        for g_start, g_end, _exon in lib.aa_to_genomic(aa_lo, aa_hi, tx):
            if g_start >= g_end:
                continue  # skip zero-length or inverted intervals
            rows.append(_bed_row(chrom, g_start, g_end, name, DOMAIN_COLOR,
                                 "Domain", aa_location, desc, mouseover))

    for codon, label, role in HOTSPOT_CODONS:
        mouseover = hotspot_mouseover(label, role, codon)
        for g_start, g_end, _exon in lib.aa_codon_genomic(codon, tx):
            if g_start >= g_end:
                continue  # skip zero-length or inverted intervals
            rows.append(_bed_row(chrom, g_start, g_end, label, HOTSPOT_COLOR,
                                 "PM1_Moderate hotspot", str(codon), role,
                                 mouseover))
    return rows


def build(db, outdir):
    """Generate the autoSql, sorted BED and bigBed files for one assembly.

    Writes TP53clinDomains.as, TP53clinDomains_<db>.bed and
    TP53clinDomains<Db>.bb into outdir, creating the directory if needed.

    Args:
        db: Assembly database name (e.g. "hg38").
        outdir: Directory that receives the output files.
    """
    print("=== {} ===".format(db))
    tx = lib.get_transcript_info(db)
    print(" {} at {}:{}-{} {}".format(
        tx['name'], tx['chrom'], tx['txStart'], tx['txEnd'], tx['strand']))
    bed_lines = generate_bed(tx)
    print(" {} BED rows".format(len(bed_lines)))

    os.makedirs(outdir, exist_ok=True)
    as_file = os.path.join(outdir, "TP53clinDomains.as")
    lib.write_autosql(as_file, AUTOSQL)

    bed = os.path.join(outdir, "TP53clinDomains_{}.bed".format(db))
    with open(bed, 'w') as f:
        # One newline-terminated line per row; with no rows this leaves the
        # file empty instead of writing a stray blank line.
        f.writelines(row + "\n" for row in bed_lines)
    # The path comes from the command line, so quote it before it is spliced
    # into the shell command. shlex.quote leaves ordinary paths unchanged.
    lib.bash("sort -k1,1 -k2,2n {0} -o {0}".format(shlex.quote(bed)))

    bb = os.path.join(outdir, "TP53clinDomains{}.bb".format(db.capitalize()))
    lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+5")
    print(" wrote {}".format(bb))


def main():
    """Parse command-line options and build the track for each requested db."""
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument('-o', '--output-dir', default=DEFAULT_OUTDIR)
    p.add_argument('--db', action='append',
                   help='Assembly db (hg38 or hg19); repeat for both. Default: hg38 only.')
    args = p.parse_args()
    # action='append' leaves args.db as None when --db is never given.
    dbs = args.db if args.db else ['hg38']
    for db in dbs:
        build(db, args.output_dir)


if __name__ == "__main__":
    main()