src/hg/makeDb/scripts/insight/insightFunctionalAssays.py a56d88e05670b759ff3b32829542537ccc790c57

a56d88e05670b759ff3b32829542537ccc790c57
lrnassar
  Tue Apr 28 19:18:20 2026 -0700
Address CR feedback on insight + tp53 hub scripts. refs #37418

Drop duplicated bash() wrappers in favor of subprocess.run / check_output
with list args, eliminating shell=True, embedded-quote concerns, and
stderr-into-stdout merging. Centralize common operations as
run_sort_bed/run_liftOver in tp53FuncLib alongside existing run_bedToBigBed.

Switch HTML escaping to stdlib html.escape() consistently. insightHCIPriors
mouseover (previously unescaped) now escapes HGVS fields, addressing the
specific c.123A>G case Jonathan flagged. Replace invalid </br> tags with
<br> across all five affected mouseover sites.

diff --git src/hg/makeDb/scripts/insight/insightFunctionalAssays.py src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
index dd08b4c067a..9c7905d1544 100644
--- src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
+++ src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
@@ -13,36 +13,37 @@
 Each variant is classified according to ACMG PS3/BS3 criteria based on
 Tavtigian OddsPath thresholds or Jia LOF score thresholds.
 
 Output: BED9+7 bigBed files for hg38 and hg19.
 
 Supplementary data files (place in same directory as this script):
   drost2018_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/7901556/bin/NIHMS1010100-supplement-CIMRA.pdf
   drost2020_supplement.docx - https://static-content.springer.com/esm/art%3A10.1038%2Fs41436-019-0736-2/MediaObjects/41436_2019_736_MOESM2_ESM.docx
   mmc2.xlsx - https://pmc.ncbi.nlm.nih.gov/articles/instance/7820803/bin/mmc2.xlsx
   rath2022_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/9772141/bin/NIHMS1834139-supplement-supinfo.pdf
 
 Author: Generated for InSiGHT VCEP
 Date: 2025
 """
 
-import subprocess
+import html
 import os
 import re
+import subprocess
 import sys
-import zipfile
 import xml.etree.ElementTree as ET
+import zipfile
 
 try:
     import openpyxl
 except ImportError:
     print("ERROR: openpyxl is required. Install with: pip install openpyxl")
     sys.exit(1)
 
 # ============================================================================
 # Configuration
 # ============================================================================
 OUTPUT_DIR = "/hive/users/lrnassar/insightHub/functionalAssays"
 
 # Transcripts for coordinate mapping (current MANE versions in hgsql)
 TRANSCRIPTS = {
     'MLH1': 'NM_000249.4',
@@ -90,44 +91,34 @@
    uint   reserved;         "RGB color value"
    string gene;             "Gene symbol"
    string protein;          "Protein change (HGVSp notation)"
    string classification;   "ACMG evidence classification (PS3/BS3)"
    string clinVarId;        "ClinVar variation ID, if available"
    string score_value;      "Functional score (OddsPath or LOF)"
    string paperRef;         "Publication reference"
    lstring _mouseOver;      "HTML mouseover text"
    )
 """
 
 # ============================================================================
 # Utility functions
 # ============================================================================
 
-def bash(cmd):
-    """Run the cmd in bash subprocess"""
-    try:
-        rawBashOutput = subprocess.run(cmd, check=True, shell=True,
-                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
-        bashStdout = rawBashOutput.stdout
-    except subprocess.CalledProcessError as e:
-        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
-    return(bashStdout)
-
 def get_transcript_info(db, accession):
     """Query hgsql to get transcript information from ncbiRefSeq"""
     query = f"SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{accession}'"
-    result = bash(f'hgsql {db} -Ne "{query}"')
+    result = subprocess.check_output(["hgsql", db, "-Ne", query], text=True)
 
     if not result.strip():
         raise ValueError(f"Transcript {accession} not found in {db}.ncbiRefSeq")
 
     fields = result.strip().split('\t')
 
     # Parse exon starts and ends (comma-separated, trailing comma)
     exon_starts = [int(x) for x in fields[7].rstrip(',').split(',')]
     exon_ends = [int(x) for x in fields[8].rstrip(',').split(',')]
 
     return {
         'name': fields[0],
         'chrom': fields[1],
         'strand': fields[2],
         'txStart': int(fields[3]),
@@ -299,43 +290,43 @@
     if val >= 0.4:
         return 'PS3_Strong'
     elif val >= 0:
         return 'Indeterminate'
     else:
         return 'BS3_Strong'
 
 # ============================================================================
 # MouseOver builder
 # ============================================================================
 
 def build_mouseover(name, protein, classification, clinvar_id, score_val, score_label, paper_key):
     """Build HTML mouseover text for a variant"""
     paper_info = PAPERS[paper_key]
     parts = []
-    parts.append(f"<b>HGVSc/HGVSp:</b> {name}")
-    parts.append(f"<b>Protein:</b> {protein}")
-    parts.append(f"<b>Classification:</b> {classification}")
+    parts.append(f"<b>HGVSc/HGVSp:</b> {html.escape(name)}")
+    parts.append(f"<b>Protein:</b> {html.escape(protein)}")
+    parts.append(f"<b>Classification:</b> {html.escape(classification)}")
 
     if clinvar_id and clinvar_id != 'NA' and clinvar_id != '':
         parts.append(f'<b>ClinVar ID:</b> <a href="https://www.ncbi.nlm.nih.gov/clinvar/variation/{clinvar_id}/" target="_blank">{clinvar_id}</a>')
     else:
         parts.append("<b>ClinVar ID:</b> N/A")
 
     parts.append(f"<b>{score_label}:</b> {score_val}")
     parts.append(f'<b>Paper:</b> <a href="https://pubmed.ncbi.nlm.nih.gov/{paper_info["pmid"]}/" target="_blank">{paper_info["ref"]}</a>')
 
-    return "</br>".join(parts)
+    return "<br>".join(parts)
 
 # ============================================================================
 # BED entry builder
 # ============================================================================
 
 def make_bed_entry(chrom, start, end, name, color, gene, protein,
                    classification, clinvar_id, score_val, paper_ref, mouseover):
     """Create a BED9+7 line"""
     return (f"{chrom}\t{start}\t{end}\t{name}\t0\t.\t{start}\t{end}\t{color}"
             f"\t{gene}\t{protein}\t{classification}\t{clinvar_id}"
             f"\t{score_val}\t{paper_ref}\t{mouseover}")
 
 # ============================================================================
 # Data parsing: Paper 1 - Drost et al. 2018
 # ============================================================================
@@ -875,41 +866,44 @@
 
     print(f"\n  Total variants to process: {len(all_variants)}")
 
     # Convert to BED entries
     print(f"\n  Querying transcript coordinates from {db}...")
     bed_entries = process_variants(all_variants, db, transcript_cache, stats)
 
     # Write BED file
     bed_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays_{db}.bed")
     print(f"\n  Writing BED file: {bed_file}")
     with open(bed_file, 'w') as f:
         f.write('\n'.join(bed_entries) + '\n')
 
     # Sort BED file
     print("  Sorting BED file...")
-    bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}")
+    subprocess.run(["sort", "-k1,1", "-k2,2n", bed_file, "-o", bed_file], check=True)
 
     # Create bigBed
     as_file = os.path.join(OUTPUT_DIR, "insightFunctionalAssays.as")
     db_label = db.replace('hg', 'Hg')
     bb_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays{db_label}.bb")
     chrom_sizes = f"/cluster/data/{db}/chrom.sizes"
 
     print(f"  Creating bigBed file: {bb_file}")
     try:
-        bash(f"bedToBigBed -as={as_file} -type=bed9+7 -tab {bed_file} {chrom_sizes} {bb_file}")
+        subprocess.run(
+            ["bedToBigBed", "-as=" + as_file, "-type=bed9+7", "-tab",
+             bed_file, chrom_sizes, bb_file],
+            check=True)
         print(f"    Successfully created: {bb_file}")
     except Exception as e:
         print(f"    ERROR creating bigBed: {e}")
 
     # Print stats
     print(f"\n  Statistics for {db}:")
     print(f"    Total included: {stats['included']}")
     for paper_key in ['drost2018', 'drost2020', 'jia2021', 'rath2022']:
         count = stats.get(f'included_{paper_key}', 0)
         if count > 0:
             print(f"      {PAPERS[paper_key]['ref']}: {count}")
     print(f"    Parse failed: {stats['parse_failed']}")
     print(f"    Coordinate failed: {stats['coord_failed']}")
     print(f"    Classify failed: {stats['classify_failed']}")