a56d88e05670b759ff3b32829542537ccc790c57 lrnassar Tue Apr 28 19:18:20 2026 -0700 Address CR feedback on insight + tp53 hub scripts. refs #37418 Drop duplicated bash() wrappers in favor of subprocess.run / check_output with list args, eliminating shell=True, embedded-quote concerns, and stderr-into-stdout merging. Centralize common operations as run_sort_bed/run_liftOver in tp53FuncLib alongside existing run_bedToBigBed. Switch HTML escaping to stdlib html.escape() consistently. insightHCIPriors mouseover (previously unescaped) now escapes HGVS fields, addressing the specific c.123A>G case Jonathan flagged. Replace invalid </br> tags with <br> across all five affected mouseover sites. diff --git src/hg/makeDb/scripts/insight/insightFunctionalAssays.py src/hg/makeDb/scripts/insight/insightFunctionalAssays.py index dd08b4c067a..9c7905d1544 100644 --- src/hg/makeDb/scripts/insight/insightFunctionalAssays.py +++ src/hg/makeDb/scripts/insight/insightFunctionalAssays.py @@ -13,36 +13,37 @@ Each variant is classified according to ACMG PS3/BS3 criteria based on Tavtigian OddsPath thresholds or Jia LOF score thresholds. Output: BED9+7 bigBed files for hg38 and hg19. Supplementary data files (place in same directory as this script): drost2018_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/7901556/bin/NIHMS1010100-supplement-CIMRA.pdf drost2020_supplement.docx - https://static-content.springer.com/esm/art%3A10.1038%2Fs41436-019-0736-2/MediaObjects/41436_2019_736_MOESM2_ESM.docx mmc2.xlsx - https://pmc.ncbi.nlm.nih.gov/articles/instance/7820803/bin/mmc2.xlsx rath2022_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/9772141/bin/NIHMS1834139-supplement-supinfo.pdf Author: Generated for InSiGHT VCEP Date: 2025 """ -import subprocess +import html import os import re +import subprocess import sys -import zipfile import xml.etree.ElementTree as ET +import zipfile try: import openpyxl except ImportError: print("ERROR: openpyxl is required. Install with: pip install openpyxl") sys.exit(1) # ============================================================================ # Configuration # ============================================================================ OUTPUT_DIR = "/hive/users/lrnassar/insightHub/functionalAssays" # Transcripts for coordinate mapping (current MANE versions in hgsql) TRANSCRIPTS = { 'MLH1': 'NM_000249.4', @@ -90,44 +91,34 @@ uint reserved; "RGB color value" string gene; "Gene symbol" string protein; "Protein change (HGVSp notation)" string classification; "ACMG evidence classification (PS3/BS3)" string clinVarId; "ClinVar variation ID, if available" string score_value; "Functional score (OddsPath or LOF)" string paperRef; "Publication reference" lstring _mouseOver; "HTML mouseover text" ) """ # ============================================================================ # Utility functions # ============================================================================ -def bash(cmd): - """Run the cmd in bash subprocess""" - try: - rawBashOutput = subprocess.run(cmd, check=True, shell=True, - stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) - bashStdout = rawBashOutput.stdout - except subprocess.CalledProcessError as e: - raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) - return(bashStdout) - def get_transcript_info(db, accession): """Query hgsql to get transcript information from ncbiRefSeq""" query = f"SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{accession}'" - result = bash(f'hgsql {db} -Ne "{query}"') + result = subprocess.check_output(["hgsql", db, "-Ne", query], text=True) if not result.strip(): raise ValueError(f"Transcript {accession} not found in {db}.ncbiRefSeq") fields = result.strip().split('\t') # Parse exon starts and ends (comma-separated, trailing comma) exon_starts = [int(x) for x in fields[7].rstrip(',').split(',')] exon_ends = [int(x) for x in fields[8].rstrip(',').split(',')] return { 'name': fields[0], 'chrom': fields[1], 'strand': fields[2], 'txStart': int(fields[3]), @@ -299,43 +290,43 @@ if val >= 0.4: return 'PS3_Strong' elif val >= 0: return 'Indeterminate' else: return 'BS3_Strong' # ============================================================================ # MouseOver builder # ============================================================================ def build_mouseover(name, protein, classification, clinvar_id, score_val, score_label, paper_key): """Build HTML mouseover text for a variant""" paper_info = PAPERS[paper_key] parts = [] - parts.append(f"<b>HGVSc/HGVSp:</b> {name}") - parts.append(f"<b>Protein:</b> {protein}") - parts.append(f"<b>Classification:</b> {classification}") + parts.append(f"<b>HGVSc/HGVSp:</b> {html.escape(name)}") + parts.append(f"<b>Protein:</b> {html.escape(protein)}") + parts.append(f"<b>Classification:</b> {html.escape(classification)}") if clinvar_id and clinvar_id != 'NA' and clinvar_id != '': parts.append(f'<b>ClinVar ID:</b> <a href="https://www.ncbi.nlm.nih.gov/clinvar/variation/{clinvar_id}/" target="_blank">{clinvar_id}</a>') else: parts.append("<b>ClinVar ID:</b> N/A") parts.append(f"<b>{score_label}:</b> {score_val}") parts.append(f'<b>Paper:</b> <a href="https://pubmed.ncbi.nlm.nih.gov/{paper_info["pmid"]}/" target="_blank">{paper_info["ref"]}</a>') - return "</br>".join(parts) + return "<br>".join(parts) # ============================================================================ # BED entry builder # ============================================================================ def make_bed_entry(chrom, start, end, name, color, gene, protein, classification, clinvar_id, score_val, paper_ref, mouseover): """Create a BED9+7 line""" return (f"{chrom}\t{start}\t{end}\t{name}\t0\t.\t{start}\t{end}\t{color}" f"\t{gene}\t{protein}\t{classification}\t{clinvar_id}" f"\t{score_val}\t{paper_ref}\t{mouseover}") # ============================================================================ # Data parsing: Paper 1 - Drost et al. 2018 # ============================================================================ @@ -875,41 +866,44 @@ print(f"\n Total variants to process: {len(all_variants)}") # Convert to BED entries print(f"\n Querying transcript coordinates from {db}...") bed_entries = process_variants(all_variants, db, transcript_cache, stats) # Write BED file bed_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays_{db}.bed") print(f"\n Writing BED file: {bed_file}") with open(bed_file, 'w') as f: f.write('\n'.join(bed_entries) + '\n') # Sort BED file print(" Sorting BED file...") - bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}") + subprocess.run(["sort", "-k1,1", "-k2,2n", bed_file, "-o", bed_file], check=True) # Create bigBed as_file = os.path.join(OUTPUT_DIR, "insightFunctionalAssays.as") db_label = db.replace('hg', 'Hg') bb_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays{db_label}.bb") chrom_sizes = f"/cluster/data/{db}/chrom.sizes" print(f" Creating bigBed file: {bb_file}") try: - bash(f"bedToBigBed -as={as_file} -type=bed9+7 -tab {bed_file} {chrom_sizes} {bb_file}") + subprocess.run( + ["bedToBigBed", "-as=" + as_file, "-type=bed9+7", "-tab", + bed_file, chrom_sizes, bb_file], + check=True) print(f" Successfully created: {bb_file}") except Exception as e: print(f" ERROR creating bigBed: {e}") # Print stats print(f"\n Statistics for {db}:") print(f" Total included: {stats['included']}") for paper_key in ['drost2018', 'drost2020', 'jia2021', 'rath2022']: count = stats.get(f'included_{paper_key}', 0) if count > 0: print(f" {PAPERS[paper_key]['ref']}: {count}") print(f" Parse failed: {stats['parse_failed']}") print(f" Coordinate failed: {stats['coord_failed']}") print(f" Classify failed: {stats['classify_failed']}")