a56d88e05670b759ff3b32829542537ccc790c57 lrnassar Tue Apr 28 19:18:20 2026 -0700 Address CR feedback on insight + tp53 hub scripts. refs #37418 Drop duplicated bash() wrappers in favor of subprocess.run / check_output with list args, eliminating shell=True, embedded-quote concerns, and stderr-into-stdout merging. Centralize common operations as run_sort_bed/run_liftOver in tp53FuncLib alongside existing run_bedToBigBed. Switch HTML escaping to stdlib html.escape() consistently. insightHCIPriors mouseover (previously unescaped) now escapes HGVS fields, addressing the specific c.123A>G case Jonathan flagged. Replace invalid
</br> tags with <br>
across all five affected mouseover sites. diff --git src/hg/makeDb/scripts/insight/buildInsightClinVar.py src/hg/makeDb/scripts/insight/buildInsightClinVar.py index 7a1d2728f17..a4f4eca2cef 100644 --- src/hg/makeDb/scripts/insight/buildInsightClinVar.py +++ src/hg/makeDb/scripts/insight/buildInsightClinVar.py @@ -18,30 +18,31 @@ python3 buildInsightClinVar.py [--output-dir DIR] Output files: - insight_clinvar_variants.tsv: Combined variant data from ClinVar - insightClinVar.as: AutoSQL schema file - insightClinVar_hg19.bed: BED file for hg19 - insightClinVar_hg38.bed: BED file for hg38 - insightClinVarHg19.bb: bigBed file for hg19 - insightClinVarHg38.bb: bigBed file for hg38 Author: UCSC Genome Browser Group Date: 2026 """ import argparse +import html import os import subprocess import sys import tempfile import time import urllib.request import xml.etree.ElementTree as ET # ============================================================================ # Configuration # ============================================================================ # Genes to fetch from ClinVar (Lynch syndrome MMR genes) GENES = ["MLH1", "MSH2", "MSH6", "PMS2"] @@ -98,48 +99,30 @@ string dateEvaluated; "Date of classification" lstring comment; "InSiGHT submitter comment" lstring _mouseOver; "HTML mouseover text" ) """ # ============================================================================ # Utility Functions # ============================================================================ def log(msg): """Print log message to stderr""" print(msg, file=sys.stderr) -def bash(cmd): - """Run a bash command and return output""" - result = subprocess.run(cmd, shell=True, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"Command failed: {cmd}\n{result.stderr}") - return result.stdout - - -def escape_html(text): - """Escape special characters for HTML""" - if not text: - return "" - return (str(text).replace('&', '&') - .replace('<', '<') - .replace('>', 
'>') - .replace('"', '"')) - - def fetch_url(url, max_retries=3): """Fetch URL with retries""" for attempt in range(max_retries): try: req = urllib.request.Request(url) with urllib.request.urlopen(req, timeout=120) as response: return response.read().decode('utf-8') except Exception as e: if attempt < max_retries - 1: log(f" Retry {attempt + 1} after error: {e}") time.sleep(2) else: raise @@ -317,31 +300,33 @@ Returns: dict mapping id to (chrom, start, end) in target assembly """ if not coords: return {} with tempfile.NamedTemporaryFile(mode='w', suffix='.bed', delete=False) as f: input_bed = f.name for var_id, (chrom, start, end) in coords.items(): f.write(f"{chrom}\t{start}\t{end}\t{var_id}\n") output_bed = input_bed.replace('.bed', '.lifted.bed') unmapped_bed = input_bed.replace('.bed', '.unmapped.bed') try: - bash(f"liftOver {input_bed} {chain_file} {output_bed} {unmapped_bed} 2>/dev/null") + subprocess.run( + ["liftOver", input_bed, chain_file, output_bed, unmapped_bed], + check=True, stderr=subprocess.DEVNULL) except Exception: for f in [input_bed, output_bed, unmapped_bed]: if os.path.exists(f): os.remove(f) return {} lifted = {} if os.path.exists(output_bed): with open(output_bed) as f: for line in f: fields = line.strip().split('\t') if len(fields) >= 4: lifted[fields[3]] = (fields[0], int(fields[1]), int(fields[2])) for f in [input_bed, output_bed, unmapped_bed]: @@ -408,37 +393,37 @@ else: # Missing coordinates unmapped.append(v) continue if start is None: unmapped.append(v) continue # Get color based on classification color = COLORS.get(v['classification'], DEFAULT_COLOR) # Build mouseover HTML clinvar_url = f"https://www.ncbi.nlm.nih.gov/clinvar/variation/{v['var_id']}/" mouse_over = ( - f"Variant: {escape_html(v['name'])}
" + f"Variant: {html.escape(v['name'])}
" f"ClinVar ID: {v['var_id']}
" - f"Classification: {escape_html(v['classification'])}
" - f"Date evaluated: {escape_html(v['date_evaluated'])}" + f"Classification: {html.escape(v['classification'])}
" + f"Date evaluated: {html.escape(v['date_evaluated'])}" ) if v['comment']: - mouse_over += f"
Comment: {escape_html(v['comment'])}" + mouse_over += f"
Comment: {html.escape(v['comment'])}" # Truncate name if too long name = v['name'] if len(v['name']) <= 200 else v['name'][:197] + "..." # Review status - use custom text review_status = "Reviewed by expert panel InSiGHT" # Build BED9+7 line comment = v['comment'].replace('\t', ' ').replace('\n', ' ') bed_fields = [ chrom, # chrom str(start), # chromStart str(end), # chromEnd name, # name '0', # score @@ -475,40 +460,43 @@ log(f" Mapped: {stats['mapped']} (native: {stats['mapped_native']}, liftOver: {stats['mapped_liftover']})") log(f" Unmapped: {len(unmapped)}") if not entries: log(" No entries to write!") return None, None # Write BED file bed_file = os.path.join(output_dir, f"insightClinVar_{assembly}.bed") log(f" Writing BED file: {bed_file}") with open(bed_file, 'w') as f: f.write('\n'.join(entries) + '\n') # Sort BED file log(f" Sorting BED file...") - bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}") + subprocess.run(["sort", "-k1,1", "-k2,2n", bed_file, "-o", bed_file], check=True) # Create bigBed as_file = os.path.join(output_dir, "insightClinVar.as") bb_file = os.path.join(output_dir, f"insightClinVar{assembly.capitalize()}.bb") chrom_sizes = CHROM_SIZES[assembly] log(f" Creating bigBed file: {bb_file}") try: - bash(f"bedToBigBed -as={as_file} -type=bed9+7 -tab {bed_file} {chrom_sizes} {bb_file}") + subprocess.run( + ["bedToBigBed", "-as=" + as_file, "-type=bed9+7", "-tab", + bed_file, chrom_sizes, bb_file], + check=True) log(f" Successfully created: {bb_file}") except Exception as e: log(f" ERROR creating bigBed: {e}") bb_file = None return bed_file, bb_file # ============================================================================ # Main Pipeline # ============================================================================ def main(): parser = argparse.ArgumentParser( description='Build InSiGHT ClinVar VCEP variants bigBed tracks'