src/hg/makeDb/scripts/tp53/tp53AFfrequencies.py a56d88e05670b759ff3b32829542537ccc790c57

a56d88e05670b759ff3b32829542537ccc790c57
lrnassar
  Tue Apr 28 19:18:20 2026 -0700
Address CR feedback on insight + tp53 hub scripts. refs #37418

Drop duplicated bash() wrappers in favor of subprocess.run / check_output
with list args, eliminating shell=True, embedded-quote concerns, and
stderr-into-stdout merging. Centralize common operations as
run_sort_bed/run_liftOver in tp53FuncLib alongside existing run_bedToBigBed.

Switch HTML escaping to stdlib html.escape() consistently. insightHCIPriors
mouseover (previously unescaped) now escapes HGVS fields, addressing the
specific c.123A>G case Jonathan flagged. Replace invalid </br> tags with
<br> across all five affected mouseover sites.

diff --git src/hg/makeDb/scripts/tp53/tp53AFfrequencies.py src/hg/makeDb/scripts/tp53/tp53AFfrequencies.py
index 3e85859e7b4..e2437044d85 100644
--- src/hg/makeDb/scripts/tp53/tp53AFfrequencies.py
+++ src/hg/makeDb/scripts/tp53/tp53AFfrequencies.py
@@ -7,30 +7,31 @@
 
     BA1            FAF >= 0.001                                     stand-alone B
     BS1            0.0003 <= FAF < 0.001                            -4 pts
     PM2_Supporting AF < 0.00003 global AND grpmax AF < 0.00004      +1 pt
 
 Uses faf95 (col 16) from the UCSC gnomAD v4.1 bigBed, plus grpmax AF
 (col 27) and global AF (col 15). CHIP note (col 29) surfaced in mouseover.
 
 Founder-effect ancestry groups (AJ/FIN/AMI/MID/Remaining) are EXCLUDED from
 the per-ancestry check per CSpec — our PM2_Supporting uses grpmax as a
 conservative proxy and flags when the proxy may miss qualifying variants.
 """
 
 import argparse
 import os
+import subprocess
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 import tp53FuncLib as lib
 
 DEFAULT_OUTDIR = "/hive/users/lrnassar/claude/RM37399/afFrequencies"
 GNOMAD_BB = "/gbdb/hg38/gnomAD/v4.1/exomes/exomes.bb"
 
 # CSpec GN009 v2.4.0 TP53 thresholds
 BA1_FAF      = 0.001
 BS1_FAF_LOW  = 0.0003
 BS1_FAF_HIGH = 0.001
 PM2_AF_GLOBAL_MAX = 0.00003
 PM2_AF_GRPMAX_MAX = 0.00004
 
@@ -116,33 +117,37 @@
         code=code, pts=POINTS[code], rule=RULES[code],
         af="{:.2e}".format(af_global) if af_global is not None else "N/A",
         faf="{:.2e}".format(faf) if faf is not None else "N/A",
         gmax="{:.2e}".format(af_grpmax) if af_grpmax is not None else "N/A",
         gpop=grpmax_pop or "N/A",
         chip=chip_line,
     )
 
 
 def classify_and_build_rows(tx, chrom):
     """Query gnomAD v4.1 exomes on hg38 and emit a list of classified rows
     keyed by an immutable hg38 identifier. The hg38 identifier is used as
     the 'name' field so the hg19 build can look up the same row after liftOver
     and rewrite the display text to reflect hg19 coords."""
     raw_bed = "/tmp/tp53_gnomad_{}.bed".format(os.getpid())
-    cmd = "bigBedToBed {} -chrom={} -start={} -end={} {}".format(
-        GNOMAD_BB, chrom, tx['txStart'], tx['txEnd'], raw_bed)
-    lib.bash(cmd)
+    subprocess.run(
+        ["bigBedToBed", GNOMAD_BB,
+         "-chrom=" + chrom,
+         "-start=" + str(tx['txStart']),
+         "-end=" + str(tx['txEnd']),
+         raw_bed],
+        check=True)
     with open(raw_bed) as f:
         rows = [line.rstrip("\n").split("\t") for line in f]
     os.remove(raw_bed)
     print("  {} variants in TP53 region (hg38)".format(len(rows)))
 
     # Build rows keyed on the hg38 display id
     classified = []  # list of dicts with all fields; hg38 coords fixed
     stats = dict(total=len(rows), BA1=0, BS1=0, PM2=0, skipped=0)
     for r in rows:
         c_start = int(r[1])
         c_end = int(r[2])
         ref = r[9]
         alt = r[10]
         af_global = safe_float(r[14])
         faf = safe_float(r[15])
@@ -203,73 +208,73 @@
             mo,
         ]))
     return lines
 
 
 def liftover_hg38_to_hg19(classified, outdir):
     """Lift each hg38 coord to hg19, returning dict hg38_name &#8594; (chrom,start,end)."""
     chain = "/cluster/data/hg38/bed/liftOver/hg38ToHg19.over.chain.gz"
     input_bed = os.path.join(outdir, ".tp53af_lift_in.bed")
     output_bed = os.path.join(outdir, ".tp53af_lift_out.bed")
     unmapped = os.path.join(outdir, ".tp53af_unmapped.bed")
     with open(input_bed, 'w') as f:
         for rec in classified:
             f.write("{}\t{}\t{}\t{}\n".format(
                 rec['chrom'], rec['hg38_start'], rec['hg38_end'], rec['hg38_name']))
-    lib.bash("liftOver {} {} {} {}".format(input_bed, chain, output_bed, unmapped))
+    lib.run_liftOver(input_bed, chain, output_bed, unmapped)
     lookup = {}
     with open(output_bed) as f:
         for line in f:
             flds = line.rstrip("\n").split("\t")
             if len(flds) >= 4:
                 lookup[flds[3]] = (flds[0], int(flds[1]), int(flds[2]))
     for p in [input_bed, output_bed, unmapped]:
         if os.path.exists(p):
             os.remove(p)
     return lookup
 
 
 def build(db, outdir):
     print("=== {} ===".format(db))
     os.makedirs(outdir, exist_ok=True)
     # We always query gnomAD on hg38 (the source), then lift to hg19 if needed
     tx_hg38 = lib.get_transcript_info('hg38')
     classified = classify_and_build_rows(tx_hg38, tx_hg38['chrom'])
 
     as_file = os.path.join(outdir, "TP53AF.as")
     lib.write_autosql(as_file, AUTOSQL)
     bed = os.path.join(outdir, "TP53AF_{}.bed".format(db))
     bb = os.path.join(outdir, "TP53AF{}.bb".format(db.capitalize()))
 
     if db == 'hg38':
         lines = emit_rows(classified, 'hg38')
         with open(bed, 'w') as f:
             f.write("\n".join(lines) + "\n")
-        lib.bash("sort -k1,1 -k2,2n {0} -o {0}".format(bed))
+        lib.run_sort_bed(bed)
         lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+8")
         print("  wrote {}".format(bb))
         return
 
     # hg19 build: liftOver each record and rewrite display name
     lookup = liftover_hg38_to_hg19(classified, outdir)
     dropped = len(classified) - len(lookup)
     if dropped:
         print("  liftOver dropped {} variants".format(dropped))
     lines = emit_rows(classified, 'hg19', coord_lookup=lookup)
     with open(bed, 'w') as f:
         f.write("\n".join(lines) + "\n")
-    lib.bash("sort -k1,1 -k2,2n {0} -o {0}".format(bed))
+    lib.run_sort_bed(bed)
     lib.run_bedToBigBed(bed, as_file, bb, lib.chrom_sizes_path(db), "bed9+8")
     print("  wrote {}".format(bb))
     return
 
 
 
 
 def main():
     p = argparse.ArgumentParser(description=__doc__)
     p.add_argument('-o', '--output-dir', default=DEFAULT_OUTDIR)
     p.add_argument('--db', action='append', help='hg38 or hg19 (repeat). Default hg38.')
     args = p.parse_args()
     dbs = args.db if args.db else ['hg38']
     for db in dbs:
         build(db, args.output_dir)