a56d88e05670b759ff3b32829542537ccc790c57
lrnassar
  Tue Apr 28 19:18:20 2026 -0700
Address CR feedback on insight + tp53 hub scripts. refs #37418

Drop duplicated bash() wrappers in favor of subprocess.run / check_output
with list args, eliminating shell=True, embedded-quote concerns, and
stderr-into-stdout merging. Centralize common operations as
run_sort_bed/run_liftOver in tp53FuncLib alongside existing run_bedToBigBed.

Switch HTML escaping to stdlib html.escape() consistently. insightHCIPriors
mouseover (previously unescaped) now escapes HGVS fields, addressing the
specific c.123A>G case Jonathan flagged. Replace invalid </br> tags with
<br> across all five affected mouseover sites.

diff --git src/hg/makeDb/scripts/tp53/tp53Flossies.py src/hg/makeDb/scripts/tp53/tp53Flossies.py
index 3410cf07bef..580695ca24b 100644
--- src/hg/makeDb/scripts/tp53/tp53Flossies.py
+++ src/hg/makeDb/scripts/tp53/tp53Flossies.py
@@ -1,253 +1,253 @@
 #!/usr/bin/env python3
 """
 TP53 VCEP FLOSSIES BS2 track generator.
 
 FLOSSIES (Female-specific aging cohort, https://whi.color.com/) sequences
 TP53 in ~4,942 women >70 unaffected by cancer. Per CSpec GN009 v2.4.0,
 observation in this cohort is a BS2 evidence source for TP53 (variant
 observed in healthy aged controls weakens pathogenic interpretation).
 
 Source: window.table_variants embedded in
 https://whi.color.com/gene/ENSG00000141510 (server-rendered HTML).
 Coordinates are hg19 / NM_000546.5; we lift to hg38 and reuse the same
 genomic position (NM_000546.5 and .6 are coding-identical for TP53).
 
 bigBed 9+8 with filterValues on bs2Applies and consequence.
 """
 
 import argparse
 import json
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 import tp53FuncLib as lib
 
 DEFAULT_OUTDIR = "/hive/users/lrnassar/claude/RM37399/flossies"
 DEFAULT_SRC = "/hive/users/lrnassar/claude/RM37399/flossies_tp53.json"
 
 COLORS = {
     'BS2':           '35,159,134',  # teal (matches BS1 family in AF track)
     'Informational': '180,180,180', # gray (non-coding observations)
 }
 
 # Per CSpec GN009 v2.4.0 §BS2: variant observed in cohort of healthy
 # adult women >70 weakens disease-causation evidence. We apply BS2 to
 # coding-region observations (missense, synonymous, splice). Non-coding
 # (UTR / deep intronic) observations are kept in the track for reference
 # but flagged "Informational" since BS2 is not standardly applied to
 # non-coding TP53 changes.
 CODING_CONSEQUENCES = {
     'missense_variant', 'synonymous_variant', 'splice_donor_variant',
     'splice_acceptor_variant', 'splice_region_variant', 'stop_gained',
     'stop_lost', 'start_lost', 'frameshift_variant',
     'inframe_insertion', 'inframe_deletion',
 }
 
 AUTOSQL = """table TP53Flossies
 "TP53 VCEP BS2 evidence: observations in the FLOSSIES healthy-women-over-70 cohort"
    (
    string chrom;        "Reference sequence chromosome or scaffold"
    uint   chromStart;   "Start position in chromosome"
    uint   chromEnd;     "End position in chromosome"
    string name;         "Variant display name"
    uint   score;        "Not used, all 0"
    char[1] strand;      "Not used, all ."
    uint   thickStart;   "Same as chromStart"
    uint   thickEnd;     "Same as chromEnd"
    uint   reserved;     "RGB color"
    string bs2Applies;   "BS2 / Informational"
    string points;       "Tavtigian points (-4 for BS2, 0 for Informational)"
    string consequence;  "Predicted consequence (missense, synonymous, UTR, etc.)"
    string hgvsc;        "HGVSc cDNA notation"
    string hgvsp;        "HGVSp protein notation (if coding)"
    uint   alleleCount;  "Allele count across the FLOSSIES cohort"
    uint   alleleNum;    "Total allele count (cohort size x 2)"
    string popBreakdown; "Per-population AC/AN"
    string genomicRef;   "Genomic reference allele (matches chromStart base on + strand)"
    string genomicAlt;   "Genomic alternate allele"
    lstring _mouseOver;  "HTML mouseover"
    )
 """
 
 
 def categorize(consequence):
     """Return ('BS2', '-4 pts') or ('Informational', '0 pts')."""
     if consequence in CODING_CONSEQUENCES:
         return ('BS2', '-4 pts')
     return ('Informational', '0 pts')
 
 
 def mouseover(disp, hgvsc, hgvsp, conseq, applies, pts,
               ac, an, pop_acs, pop_ans):
     pop_lines = []
     for pop in pop_acs:
         an_p = pop_ans.get(pop, 0) or 0
         ac_p = pop_acs.get(pop, 0) or 0
         if an_p:
             pop_lines.append("{}: {}/{} ({:.4f})".format(
                 pop, ac_p, an_p, ac_p / an_p))
         else:
             pop_lines.append("{}: {}/{}".format(pop, ac_p, an_p))
     af = (ac / an) if an else 0.0
     bs2_section = ""
     if applies == 'BS2':
         bs2_section = (
             "<br><b>BS2 applicability:</b> Yes ({} pts) "
             "&#8212; coding observation in healthy aged-female cohort"
         ).format(pts)
     else:
         bs2_section = (
             "<br><b>BS2 applicability:</b> Informational only "
             "&#8212; non-coding consequence; BS2 is not standardly applied "
             "to TP53 UTR/intronic changes per CSpec GN009"
         )
     hgvsp_line = ""
     if hgvsp:
         hgvsp_line = "<br><b>Protein:</b> {}".format(hgvsp)
     return (
         "<b>FLOSSIES observation (BS2 evidence)</b>"
         "<br><b>Variant:</b> {disp}"
         "<br><b>cDNA:</b> {hgvsc}"
         "{hgvsp_line}"
         "<br><b>Consequence:</b> {cons}"
         "<br><b>Cohort:</b> healthy women &gt;70 yo (FLOSSIES, WHI/UW)"
         "<br><b>Allele count:</b> {ac}/{an} (AF {af:.4f})"
         "<br><b>By population:</b> {pop}"
         "{bs2}"
         "<br><b>Source:</b> FLOSSIES (whi.color.com); "
         "applied per CSpec GN009 v2.4.0 &#167;BS2"
     ).format(
         disp=disp, hgvsc=hgvsc or "N/A", hgvsp_line=hgvsp_line,
         cons=conseq, ac=ac, an=an, af=af,
         pop=" | ".join(pop_lines) if pop_lines else "N/A",
         bs2=bs2_section,
     )
 
 
 def load_flossies(src_json):
     with open(src_json) as f:
         return json.load(f)
 
 
 def hg19_to_hg38_lift(records, outdir):
     """Lift FLOSSIES hg19 coords to hg38; return dict keyed on variant_id."""
     chain = "/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz"
     if not os.path.exists(chain):
         chain = "/cluster/data/hg19/bed/liftOver/hg19ToHg38.over.chain.gz"
     in_bed = os.path.join(outdir, ".flossies_lift_in.bed")
     out_bed = os.path.join(outdir, ".flossies_lift_out.bed")
     unmapped = os.path.join(outdir, ".flossies_unmapped.bed")
     with open(in_bed, 'w') as f:
         for v in records:
             pos = int(v['pos'])
             ref = v['ref']
             f.write("chr{}\t{}\t{}\t{}\n".format(
                 v['chrom'], pos - 1, pos - 1 + len(ref), v['variant_id']))
-    lib.bash("liftOver {} {} {} {}".format(in_bed, chain, out_bed, unmapped))
+    lib.run_liftOver(in_bed, chain, out_bed, unmapped)
     lookup = {}
     with open(out_bed) as f:
         for line in f:
             flds = line.rstrip("\n").split("\t")
             if len(flds) >= 4:
                 lookup[flds[3]] = (flds[0], int(flds[1]), int(flds[2]))
     for p in [in_bed, out_bed, unmapped]:
         if os.path.exists(p):
             os.remove(p)
     return lookup
 
 
 def emit_rows(records, assembly, hg38_lookup):
     """Build BED9+8 lines for the requested assembly."""
     lines = []
     skipped = 0
     for v in records:
         ref = v['ref']
         alt = v['alt']
         pos_hg19 = int(v['pos'])
         if assembly == 'hg19':
             chrom = "chr{}".format(v['chrom'])
             start = pos_hg19 - 1
             end = start + len(ref)
         else:
             if v['variant_id'] not in hg38_lookup:
                 skipped += 1
                 continue
             chrom, start, end = hg38_lookup[v['variant_id']]
         conseq = v.get('major_consequence') or 'unknown'
         applies, pts = categorize(conseq)
         color = COLORS[applies]
         hgvsc = v.get('HGVSc') or ''
         hgvsp = v.get('HGVSp') or ''
         ac = int(v.get('allele_count') or 0)
         an = int(v.get('allele_num') or 0)
         pop_acs = v.get('pop_acs') or {}
         pop_ans = v.get('pop_ans') or {}
         pop_breakdown = "; ".join(
             "{}={}/{}".format(p, pop_acs.get(p, 0), pop_ans.get(p, 0))
             for p in pop_acs
         )
         disp = "{}:{}{}>{}".format(chrom, start + 1, ref, alt)
         name = hgvsp if hgvsp else (hgvsc if hgvsc else disp)
         mo = mouseover(disp, hgvsc, hgvsp, conseq, applies, pts,
                        ac, an, pop_acs, pop_ans)
         lines.append("\t".join([
             chrom, str(start), str(end),
             name, "0", ".",
             str(start), str(end),
             color, applies, pts,
             conseq, hgvsc, hgvsp,
             str(ac), str(an), pop_breakdown,
             ref, alt,
             mo,
         ]))
     if skipped:
         print("  liftOver dropped {} variants".format(skipped))
     return lines
 
 
 def build(db, outdir, src_json):
     print("=== {} ===".format(db))
     os.makedirs(outdir, exist_ok=True)
     records = load_flossies(src_json)
     print("  {} FLOSSIES records loaded".format(len(records)))
     from collections import Counter
     consequences = Counter(r.get('major_consequence') for r in records)
     print("  Consequences: {}".format(dict(consequences)))
 
     if db == 'hg38':
         hg38_lookup = hg19_to_hg38_lift(records, outdir)
         print("  liftOver matched {}/{} variants".format(
             len(hg38_lookup), len(records)))
         lines = emit_rows(records, 'hg38', hg38_lookup)
     else:
         lines = emit_rows(records, 'hg19', None)
     print("  {} BED rows".format(len(lines)))
 
     bs2_count = sum(1 for L in lines if "\tBS2\t" in L)
     print("  BS2 applicable: {}".format(bs2_count))
 
     as_file = os.path.join(outdir, "TP53Flossies.as")
     lib.write_autosql(as_file, AUTOSQL)
     bed = os.path.join(outdir, "TP53Flossies_{}.bed".format(db))
     with open(bed, 'w') as f:
         f.write("\n".join(lines) + "\n")
-    lib.bash("sort -k1,1 -k2,2n {0} -o {0}".format(bed))
+    lib.run_sort_bed(bed)
     bb = os.path.join(outdir, "TP53Flossies{}.bb".format(db.capitalize()))
     lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+10")
     print("  wrote {}".format(bb))
 
 
 def main():
     p = argparse.ArgumentParser(description=__doc__)
     p.add_argument('-o', '--output-dir', default=DEFAULT_OUTDIR)
     p.add_argument('--db', action='append', help='hg38 or hg19 (repeat). Default hg38.')
     p.add_argument('--src', default=DEFAULT_SRC)
     args = p.parse_args()
     dbs = args.db if args.db else ['hg38']
     for db in dbs:
         build(db, args.output_dir, args.src)
 
 
 if __name__ == "__main__":
     main()