a56d88e05670b759ff3b32829542537ccc790c57
lrnassar
Tue Apr 28 19:18:20 2026 -0700
Address CR feedback on insight + tp53 hub scripts. refs #37418
Drop duplicated bash() wrappers in favor of subprocess.run / check_output
with list args, eliminating shell=True, embedded-quote concerns, and
stderr-into-stdout merging. Centralize common operations as
run_sort_bed/run_liftOver in tp53FuncLib alongside existing run_bedToBigBed.
Switch HTML escaping to stdlib html.escape() consistently. insightHCIPriors
mouseover (previously unescaped) now escapes HGVS fields, addressing the
specific c.123A>G case Jonathan flagged. Replace invalid line-break tags
with <br> across all five affected mouseover sites.
diff --git src/hg/makeDb/scripts/tp53/tp53PVS1Splice.py src/hg/makeDb/scripts/tp53/tp53PVS1Splice.py
index 8403d0b88e5..841004e8e93 100644
--- src/hg/makeDb/scripts/tp53/tp53PVS1Splice.py
+++ src/hg/makeDb/scripts/tp53/tp53PVS1Splice.py
@@ -1,219 +1,219 @@
#!/usr/bin/env python3
"""
TP53 VCEP PVS1 Splice Sites subtrack (under PVS1 Evidence composite).
Parses Supplementary Table S1 (1,061 rows) from the CSpec GN009 v2.4.0
splicing worksheet. Extracts the 120 canonical +/- 1,2 splice-site SNVs,
forward-fills PVS1 strength assignments from the anchor variant to its
two sibling variants at the same position, and emits bigBed 9+6.
Transcript: NM_000546.6 (chr17 minus strand).
"""
import argparse
import os
import re
import sys
import openpyxl
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import tp53FuncLib as lib
# Default output directory and source workbook; both can be overridden on the
# command line (-o/--output-dir and --src).
DEFAULT_OUTDIR = "/hive/users/lrnassar/claude/RM37399/pvs1Splice"
DEFAULT_SRC = "/hive/users/lrnassar/claude/RM37399/tp53_downloads/splicing_worksheet.xlsx"
# Matches canonical splice-site SNVs of the form c.<pos><sign><1|2><ref>><alt>
# (illustrative shape: c.123+1G>A). Groups, in order: cDNA position, sign
# (+ or -), intronic offset (1 or 2), reference base, alternate base.
VAR_RE = re.compile(r'^c\.(-?\d+)([+-])([12])([ACGT])>([ACGT])$')
# Keyed by the PVS1 strength value read from the worksheet.
# (color, points, acmg_for_display). RNA-variants use same color as canonical.
STRENGTH_MAP = {
"PVS1": ("180,0,0", "+8", "PVS1"),
"PVS1 (RNA)": ("180,0,0", "+8", "PVS1 (RNA)"),
"PVS1_Strong": ("210,0,0", "+4", "PVS1_Strong"),
"PVS1_Strong (RNA)": ("210,0,0", "+4", "PVS1_Strong (RNA)"),
"PVS1_Moderate": ("204,102,0","+2", "PVS1_Moderate"),
"PVS1_Moderate (RNA)":("204,102,0","+2", "PVS1_Moderate (RNA)"),
"PVS1_N/A": ("128,128,128","0", "PVS1_N/A"),
}
# Fallback (color, points, acmg_for_display) used when a variant has no
# strength or its strength is not a STRENGTH_MAP key.
UNASSIGNED = ("200,200,200", "0", "Not yet assigned by VCEP")
# autoSql schema written to TP53PVS1Splice.as by build() and handed to
# bedToBigBed as "bed9+6": nine standard BED fields followed by six custom
# fields (acmgCode .. _mouseOver). Field order must match the column order
# emitted by generate_bed().
AUTOSQL = """table TP53PVS1Splice
"TP53 VCEP PVS1 splice-site variants (+/- 1,2) from CSpec GN009 v2.4.0 Table S1"
(
string chrom; "Reference sequence chromosome or scaffold"
uint chromStart; "Start position in chromosome"
uint chromEnd; "End position in chromosome"
string name; "HGVSc variant"
uint score; "Not used, all 0"
char[1] strand; "Not used, all ."
uint thickStart; "Same as chromStart"
uint thickEnd; "Same as chromEnd"
uint reserved; "RGB color"
string acmgCode; "VCEP-assigned PVS1 strength"
string points; "Tavtigian points"
string assignSource; "'Assigned' or 'Inherited from site anchor'"
lstring rationale; "VCEP rationale text (from Table S1)"
uint exonNum; "mRNA exon number"
lstring _mouseOver; "HTML mouseover"
)
"""
def cdna_splice_to_genomic(exon_num, sign, offset, tx):
    """Return BED (g_start, g_end) for a canonical +/-1 or +/-2 splice variant.

    Args:
        exon_num: 1-based mRNA exon number (counted in transcript order).
        sign: '+' for a donor-side position (after the exon in transcript
            order); any other value is treated as acceptor-side ('-').
        offset: intronic distance from the exon boundary (1 or 2).
        tx: mapping with 'strand', 'exonStarts' and 'exonEnds'; the exon
            lists are indexed in genomic order with 0-based half-open
            coordinates.

    Returns:
        A 1-bp half-open interval (g_start, g_start + 1).

    Raises:
        IndexError: if exon_num is outside 1..len(tx['exonStarts']).
    """
    n_exons = len(tx['exonStarts'])
    # Reject out-of-range exons explicitly: a negative list index would
    # otherwise wrap around silently and return another exon's coordinates.
    if not 1 <= exon_num <= n_exons:
        raise IndexError(
            "exon {} out of range 1..{}".format(exon_num, n_exons))

    minus = tx['strand'] == '-'
    # Exon lists are in genomic order, so minus-strand exons count from the end.
    idx = n_exons - exon_num if minus else exon_num - 1
    ex_g_start = tx['exonStarts'][idx]
    ex_g_end = tx['exonEnds'][idx]

    # The intronic base lies genomically *before* the exon for a minus-strand
    # donor or a plus-strand acceptor, and *after* it otherwise.
    if (sign == '+') == minus:
        g = ex_g_start - offset
    else:
        g = ex_g_end + offset - 1
    return (g, g + 1)
def load_variants(src_xlsx):
    """Read canonical splice-site SNVs from the "Table S1" worksheet.

    Rows are scanned from worksheet row 4 onward. A row is kept when its
    first cell is not None and its third cell is a non-empty string that
    matches VAR_RE. Each kept row becomes a dict holding the exon number,
    the parsed HGVS parts, and the raw PVS1 strength / rationale cells
    (columns at index 15 and 16).

    After parsing, variants sharing (c_pos, sign, offset) are grouped and
    every dict gains 'final_pvs1', 'final_rat' and 'assign_source': a
    variant with its own strength keeps it ("Assigned"); one without
    inherits the first non-empty strength and rationale found in its group
    ("Inherited from site anchor"); otherwise it is "Not yet assigned".

    Returns:
        The list of variant dicts, in worksheet order.
    """
    sheet = openpyxl.load_workbook(src_xlsx, data_only=True)["Table S1"]

    variants = []
    for cells in sheet.iter_rows(min_row=4, values_only=True):
        if cells[0] is None:
            continue
        hgvs_cell = cells[2]
        if not isinstance(hgvs_cell, str) or not hgvs_cell:
            continue
        hgvs = hgvs_cell.strip()
        hit = VAR_RE.match(hgvs)
        if hit is None:
            continue
        c_pos, sign, offset, ref, alt = hit.groups()
        variants.append({
            'exon': int(cells[0]),
            'variant': hgvs,
            'c_pos': int(c_pos),
            'sign': sign,
            'offset': int(offset),
            'ref': ref,
            'alt': alt,
            'pvs1_raw': cells[15],
            'rationale_raw': cells[16],
        })

    # Group the sibling variants that share one splice-site position.
    sites = {}
    for rec in variants:
        site_key = (rec['c_pos'], rec['sign'], rec['offset'])
        sites.setdefault(site_key, []).append(rec)

    # Forward-fill strength and rationale from the first populated sibling.
    for siblings in sites.values():
        anchor_pvs1 = next(filter(None, (s['pvs1_raw'] for s in siblings)), None)
        anchor_rat = next(filter(None, (s['rationale_raw'] for s in siblings)), None)
        for rec in siblings:
            own_strength = rec['pvs1_raw']
            if own_strength:
                rec.update(
                    final_pvs1=own_strength,
                    final_rat=rec['rationale_raw'] or anchor_rat,
                    assign_source="Assigned",
                )
            elif anchor_pvs1:
                rec.update(
                    final_pvs1=anchor_pvs1,
                    final_rat=anchor_rat,
                    assign_source="Inherited from site anchor",
                )
            else:
                rec.update(
                    final_pvs1=None,
                    final_rat=None,
                    assign_source="Not yet assigned",
                )
    return variants
def mouseover(v, color, pts, acmg):
    """Build the HTML mouseover string for one splice-site variant.

    Args:
        v: variant dict as produced by load_variants(); reads 'variant',
            'exon', 'sign', 'offset', 'assign_source', 'final_rat' and
            'final_pvs1'.
        color: accepted for signature compatibility; not used in the text.
        pts: points string to display.
        acmg: ACMG code label to display.

    Returns:
        The result of lib.html_ascii_safe() applied to the assembled text,
        whose lines are separated by <br> tags.
    """
    # Build rationale line: assigned variants should not say "not explicitly
    # assigned" just because the rationale cell is blank.
    if v['final_rat']:
        rat_text = v['final_rat']
    elif v['final_pvs1']:
        rat_text = ("VCEP-assigned strength; detailed rationale not populated "
                    "in Table S1 for this position.")
    else:
        rat_text = "VCEP has not explicitly assigned a strength for this splice site."

    if v['sign'] == '+':
        sign_label = "donor ({}{})".format(v['sign'], v['offset'])
    else:
        sign_label = "acceptor ({}{})".format(v['sign'], v['offset'])

    # NOTE(review): the field values are interpolated as-is; this relies on
    # lib.html_ascii_safe() for whatever sanitising it performs -- confirm
    # whether HGVS names containing '>' need html.escape() here as well.
    return lib.html_ascii_safe((
        "PVS1 splice-site variant: {name} ({pts} pts)"
        "<br>ACMG code: {acmg}"
        "<br>Exon: {exon} ({sign_label})"
        "<br>Assignment: {src}"
        "<br>VCEP rationale: {rat}"
        "<br>Source: CSpec GN009 v2.4.0 Supplementary Table S1"
    ).format(
        name=v['variant'], pts=pts, acmg=acmg,
        exon=v['exon'],
        sign_label=sign_label,
        src=v['assign_source'], rat=rat_text,
    ))
def _flatten_whitespace(text):
    """Replace tabs and line breaks with a space so a value stays in one BED column."""
    return re.sub(r'[\t\r\n]+', ' ', str(text))


def generate_bed(variants, tx):
    """Convert variant dicts into tab-separated BED 9+6 rows.

    Args:
        variants: list of dicts as returned by load_variants().
        tx: transcript mapping providing 'chrom' plus the keys needed by
            cdna_splice_to_genomic().

    Returns:
        A list of row strings (no trailing newline), one per variant that
        could be placed on the genome. Variants whose exon number is not
        present in the transcript are skipped with a warning on stderr.
    """
    lines = []
    chrom = tx['chrom']
    for v in variants:
        try:
            g_start, g_end = cdna_splice_to_genomic(
                v['exon'], v['sign'], v['offset'], tx)
        except IndexError:
            # Best-effort: keep going, but make the dropped row visible.
            print(" WARNING: skipping {} (exon {} not in transcript)".format(
                v['variant'], v['exon']), file=sys.stderr)
            continue
        if g_start >= g_end:
            # Defensive guard against an empty or inverted interval.
            continue

        # Unknown or missing strengths fall back to the UNASSIGNED styling.
        final = v['final_pvs1']
        if final and final in STRENGTH_MAP:
            color, pts, acmg = STRENGTH_MAP[final]
        else:
            color, pts, acmg = UNASSIGNED

        fields = [
            chrom, str(g_start), str(g_end),
            v['variant'], "0", ".",
            str(g_start), str(g_end),
            color, acmg, pts, v['assign_source'],
            lib.html_ascii_safe(v['final_rat'] or ""),
            str(v['exon']),
            mouseover(v, color, pts, acmg),
        ]
        # Spreadsheet cells may contain embedded tabs or line breaks, which
        # would split the row into extra columns or lines in the BED file.
        lines.append("\t".join(_flatten_whitespace(f) for f in fields))
    return lines
def build(db, outdir, src_xlsx):
    """Build the PVS1 splice-site bigBed for one assembly.

    Loads the variants from the worksheet, prints a strength summary, and
    writes three files into outdir: TP53PVS1Splice.as (autoSql),
    TP53PVS1Splice_<db>.bed (sorted BED) and TP53PVS1Splice<Db>.bb (bigBed,
    where <Db> is db.capitalize()).

    Args:
        db: assembly name passed to the tp53FuncLib helpers (e.g. hg38).
        outdir: output directory; created if missing.
        src_xlsx: path to the source XLSX containing "Table S1".
    """
    from collections import Counter

    print("=== {} ===".format(db))
    os.makedirs(outdir, exist_ok=True)
    tx = lib.get_transcript_info(db)

    variants = load_variants(src_xlsx)
    print(" {} splice variants parsed".format(len(variants)))
    dist = Counter(v['final_pvs1'] or "Unassigned" for v in variants)
    print(" Strength distribution:")
    for k, n in dist.most_common():
        print(" {}: {}".format(k, n))

    bed_lines = generate_bed(variants, tx)
    print(" {} BED rows".format(len(bed_lines)))

    as_file = os.path.join(outdir, "TP53PVS1Splice.as")
    lib.write_autosql(as_file, AUTOSQL)

    bed = os.path.join(outdir, "TP53PVS1Splice_{}.bed".format(db))
    with open(bed, 'w') as f:
        # One newline per row; with no rows the file is left empty rather
        # than containing a single blank line.
        f.writelines(line + "\n" for line in bed_lines)
    # Sort in place via the shared library helper (no shell string involved).
    lib.run_sort_bed(bed)

    bb = os.path.join(outdir, "TP53PVS1Splice{}.bb".format(db.capitalize()))
    lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+6")
    print(" wrote {}".format(bb))
def main():
    """Parse the command line and build the track for each requested assembly."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-o', '--output-dir', default=DEFAULT_OUTDIR)
    parser.add_argument('--db', action='append', help='hg38 or hg19 (repeat). Default hg38.')
    parser.add_argument('--src', default=DEFAULT_SRC, help='Source XLSX (Table S1)')
    opts = parser.parse_args()
    # --db may be given several times; with none given only hg38 is built.
    for assembly in (opts.db or ['hg38']):
        build(assembly, opts.output_dir, opts.src)


if __name__ == "__main__":
    main()