src/hg/makeDb/scripts/insight/insightAFfrequencies.py a56d88e05670b759ff3b32829542537ccc790c57

a56d88e05670b759ff3b32829542537ccc790c57
lrnassar
  Tue Apr 28 19:18:20 2026 -0700
Address CR feedback on insight + tp53 hub scripts. refs #37418

Drop duplicated bash() wrappers in favor of subprocess.run / check_output
with list args, eliminating shell=True, embedded-quote concerns, and
stderr-into-stdout merging. Centralize common operations as
run_sort_bed/run_liftOver in tp53FuncLib alongside existing run_bedToBigBed.

Switch HTML escaping to stdlib html.escape() consistently. insightHCIPriors
mouseover (previously unescaped) now escapes HGVS fields, addressing the
specific c.123A>G case Jonathan flagged. Replace invalid </br> tags with
<br> across all five affected mouseover sites.

diff --git src/hg/makeDb/scripts/insight/insightAFfrequencies.py src/hg/makeDb/scripts/insight/insightAFfrequencies.py
index 5e8a1df83c3..33c7b7055cb 100644
--- src/hg/makeDb/scripts/insight/insightAFfrequencies.py
+++ src/hg/makeDb/scripts/insight/insightAFfrequencies.py
@@ -1,46 +1,37 @@
 #!/usr/bin/env python3
 """
 InSiGHT VCEP AF Frequencies Track Generator
 
 This script generates a UCSC Genome Browser track (bigBed format) that applies
 ACMG classification guidelines based on gnomAD v4.1 exome AF_grpmax values
 for Lynch syndrome genes: MLH1, MSH2, MSH6, and PMS2.
 
 Coordinates are extracted from gnomAD v4.1 exomes bigBed, with detailed
 annotations retrieved from the companion tab.gz file.
 
 Author: Generated for InSiGHT VCEP
 Date: 2025
 """
 
-import subprocess
-import os
-import struct
 import bisect
 import gzip
+import html
 import json
-
-def bash(cmd):
-    """Run the cmd in bash subprocess"""
-    try:
-        rawBashOutput = subprocess.run(cmd, check=True, shell=True,
-                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
-        bashStdout = rawBashOutput.stdout
-    except subprocess.CalledProcessError as e:
-        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
-    return(bashStdout)
+import os
+import struct
+import subprocess
 
 # ============================================================================
 # Configuration
 # ============================================================================
 OUTPUT_DIR = "/hive/users/lrnassar/insightHub/afFrequencies"
 GNOMAD_BB = "/gbdb/hg38/gnomAD/v4.1/exomes/exomes.bb"
 GNOMAD_TAB = "/gbdb/hg38/gnomAD/v4.1/exomes/gnomad.v4.1.exomes.details.tab.gz"
 GNOMAD_GZI = "/gbdb/hg38/gnomAD/v4.1/exomes/gnomad.v4.1.exomes.details.tab.gz.gzi"
 
 # Gene to transcript mapping (coordinates will be queried from hgsql)
 TRANSCRIPTS = {
     'MLH1': 'NM_000249.4',
     'MSH2': 'NM_000251.3',
     'MSH6': 'NM_000179.3',
     'PMS2': 'NM_000535.7',
@@ -173,53 +164,59 @@
 
             idx += 1
 
         return result.decode('utf-8', errors='replace')
 
     def close(self):
         self.gz_handle.close()
 
 # ============================================================================
 # Main Processing Functions
 # ============================================================================
 
 def get_transcript_info(accession):
     """Query hgsql to get transcript information from hg38.ncbiRefSeq"""
     query = f"SELECT name, chrom, strand, txStart, txEnd FROM ncbiRefSeq WHERE name='{accession}'"
-    result = bash(f'hgsql hg38 -Ne "{query}"')
+    result = subprocess.check_output(["hgsql", "hg38", "-Ne", query], text=True)
 
     if not result.strip():
         raise ValueError(f"Transcript {accession} not found in hg38.ncbiRefSeq")
 
     fields = result.strip().split('\t')
 
     return {
         'name': fields[0],
         'chrom': fields[1],
         'strand': fields[2],
         'txStart': int(fields[3]),
         'txEnd': int(fields[4]),
     }
 
 def extract_variants(gene, gene_info):
     """Extract gnomAD variants for a gene region"""
     chrom = gene_info['chrom']
     start = gene_info['txStart']
     end = gene_info['txEnd']
 
     output_file = os.path.join(OUTPUT_DIR, f"gnomad_{gene}_raw.bed")
-    bash(f"bigBedToBed {GNOMAD_BB} -chrom={chrom} -start={start} -end={end} {output_file}")
+    subprocess.run(
+        ["bigBedToBed", GNOMAD_BB,
+         "-chrom=" + chrom,
+         "-start=" + str(start),
+         "-end=" + str(end),
+         output_file],
+        check=True)
 
     variants = []
     with open(output_file, 'r') as f:
         for line in f:
             fields = line.strip().split('\t')
             variants.append(fields)
 
     return variants
 
 def get_hgvsc_for_transcript(vep_json, transcript):
     """Extract HGVSc for specific transcript from VEP JSON"""
     try:
         vep_data = json.loads(vep_json)
         # Find gene that contains this transcript
         for gene_name, gene_data in vep_data.items():
@@ -288,34 +285,36 @@
         if hgvsc is None:
             stats['wrong_transcript'] += 1
             continue
 
         # Classify variant
         acmg_code = classify_variant(af_grpmax, gene)
 
         if acmg_code is None:
             stats['below_threshold'] += 1
             continue
 
         # Create BED entry
         color = COLORS[acmg_code]
         rule = RULES[gene][acmg_code]
 
-        # HTML-encode special characters for mouseOver
-        rule_html = rule.replace('≥', '&ge;').replace('≤', '&le;').replace('>', '&gt;').replace('<', '&lt;')
+        # HTML-escape; UCSC mouseover doesn't render raw ≤/≥ so map to entities.
+        rule_html = html.escape(rule).replace('≥', '&ge;').replace('≤', '&le;')
 
-        mouse_over = f"<b>HGVSc:</b> {hgvsc}</br><b>ACMG code:</b> {acmg_code}</br><b>Rule:</b> {rule_html}"
+        mouse_over = (f"<b>HGVSc:</b> {html.escape(hgvsc)}<br>"
+                      f"<b>ACMG code:</b> {html.escape(acmg_code)}<br>"
+                      f"<b>Rule:</b> {rule_html}")
 
         bed_line = f"{chrom}\t{chromStart}\t{chromEnd}\t{hgvsc}\t0\t.\t{chromStart}\t{chromEnd}\t{color}\t{acmg_code}\t{rule}\t{mouse_over}"
         bed_entries.append(bed_line)
         stats['included'] += 1
 
     print(f"  Included: {len(bed_entries)} variants")
     return bed_entries
 
 def main():
     print("=" * 70)
     print("InSiGHT VCEP AF Frequencies Track Generator")
     print("=" * 70)
 
     # Create output directory if needed
     os.makedirs(OUTPUT_DIR, exist_ok=True)
@@ -352,39 +351,42 @@
     for gene, accession in TRANSCRIPTS.items():
         tx_info = transcript_info[gene]
         entries = process_gene(gene, accession, tx_info, gzi_reader, stats)
         all_bed_entries.extend(entries)
 
     gzi_reader.close()
 
     # Write BED file
     bed_file = os.path.join(OUTPUT_DIR, "InSiGHTAF_hg38.bed")
     print(f"\nWriting BED file: {bed_file}")
     with open(bed_file, 'w') as f:
         f.write('\n'.join(all_bed_entries) + '\n')
 
     # Sort BED file
     print("Sorting BED file...")
-    bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}")
+    subprocess.run(["sort", "-k1,1", "-k2,2n", bed_file, "-o", bed_file], check=True)
 
     # Create bigBed for hg38
     bb_file = os.path.join(OUTPUT_DIR, "InSiGHTAFHg38.bb")
     chrom_sizes = "/cluster/data/hg38/chrom.sizes"
 
     print(f"\nCreating bigBed file: {bb_file}")
     try:
-        bash(f"bedToBigBed -as={as_file} -type=bed9+3 -tab {bed_file} {chrom_sizes} {bb_file}")
+        subprocess.run(
+            ["bedToBigBed", "-as=" + as_file, "-type=bed9+3", "-tab",
+             bed_file, chrom_sizes, bb_file],
+            check=True)
         print(f"  Successfully created: {bb_file}")
     except Exception as e:
         print(f"  ERROR creating bigBed: {e}")
 
     # LiftOver to hg19
     # liftOver only handles basic BED, so we need to:
     # 1. Extract chrom, chromStart, chromEnd, name (for joining)
     # 2. Lift those coordinates
     # 3. Rejoin with the rest of the fields
     print(f"\nLifting over to hg19...")
     bed_file_hg19 = os.path.join(OUTPUT_DIR, "InSiGHTAF_hg19.bed")
     unmapped_file = os.path.join(OUTPUT_DIR, "InSiGHTAF_unmapped.bed")
     chain_file = "/cluster/data/hg38/bed/liftOver/hg38ToHg19.over.chain.gz"
     bb_file_hg19 = os.path.join(OUTPUT_DIR, "InSiGHTAFHg19.bb")
 
@@ -393,53 +395,60 @@
         bed4_file = os.path.join(OUTPUT_DIR, "InSiGHTAF_hg38_bed4.bed")
         lifted_bed4 = os.path.join(OUTPUT_DIR, "InSiGHTAF_hg19_bed4.bed")
 
         # Extract BED4 and create lookup dict for extra fields
         extra_fields = {}  # name -> (fields 5-12)
         with open(bed_file, 'r') as fin, open(bed4_file, 'w') as fout:
             for line in fin:
                 fields = line.strip().split('\t')
                 name = fields[3]
                 # Store extra fields (score, strand, thickStart, thickEnd, color, acmgCode, rule, mouseOver)
                 extra_fields[name] = fields[4:]
                 # Write BED4
                 fout.write(f"{fields[0]}\t{fields[1]}\t{fields[2]}\t{fields[3]}\n")
 
         # Run liftOver on BED4
-        bash(f"liftOver {bed4_file} {chain_file} {lifted_bed4} {unmapped_file}")
-        unmapped_count = int(bash(f"wc -l < {unmapped_file}").strip()) // 2
+        subprocess.run(
+            ["liftOver", bed4_file, chain_file, lifted_bed4, unmapped_file],
+            check=True)
+        # liftOver writes 2 lines per unmapped record (a # comment then the BED row).
+        with open(unmapped_file) as fh:
+            unmapped_count = sum(1 for _ in fh) // 2
         print(f"  Lifted over, {unmapped_count} variants could not be mapped")
 
         # Rejoin lifted coordinates with extra fields
         with open(lifted_bed4, 'r') as fin, open(bed_file_hg19, 'w') as fout:
             for line in fin:
                 fields = line.strip().split('\t')
                 chrom, start, end, name = fields[0], fields[1], fields[2], fields[3]
                 if name in extra_fields:
                     extra = extra_fields[name]
                     # Update thickStart/thickEnd (fields 2 and 3 in extra, 0-indexed) to match new coords
                     extra[2] = start  # thickStart
                     extra[3] = end    # thickEnd
                     fout.write(f"{chrom}\t{start}\t{end}\t{name}\t" + "\t".join(extra) + "\n")
 
         # Sort hg19 BED
-        bash(f"sort -k1,1 -k2,2n {bed_file_hg19} -o {bed_file_hg19}")
+        subprocess.run(["sort", "-k1,1", "-k2,2n", bed_file_hg19, "-o", bed_file_hg19], check=True)
 
         # Create bigBed for hg19
         chrom_sizes_hg19 = "/cluster/data/hg19/chrom.sizes"
 
-        bash(f"bedToBigBed -as={as_file} -type=bed9+3 -tab {bed_file_hg19} {chrom_sizes_hg19} {bb_file_hg19}")
+        subprocess.run(
+            ["bedToBigBed", "-as=" + as_file, "-type=bed9+3", "-tab",
+             bed_file_hg19, chrom_sizes_hg19, bb_file_hg19],
+            check=True)
         print(f"  Successfully created: {bb_file_hg19}")
 
         # Cleanup temp files
         os.remove(bed4_file)
         os.remove(lifted_bed4)
     except Exception as e:
         print(f"  ERROR during liftOver: {e}")
 
     # Print statistics
     print("\n" + "=" * 70)
     print("Statistics")
     print("=" * 70)
     print(f"  Variants included in track:        {stats['included']}")
     print(f"  Excluded - No AF_grpmax (N/A):     {stats['no_af_grpmax']}")
     print(f"  Excluded - No HGVSc in details:    {stats['no_hgvsc']}")