a56d88e05670b759ff3b32829542537ccc790c57
lrnassar
Tue Apr 28 19:18:20 2026 -0700
Address CR feedback on insight + tp53 hub scripts. refs #37418
Drop duplicated bash() wrappers in favor of subprocess.run / check_output
with list args, eliminating shell=True, embedded-quote concerns, and
stderr-into-stdout merging. Centralize common operations as
run_sort_bed/run_liftOver in tp53FuncLib alongside existing run_bedToBigBed.
Switch HTML escaping to stdlib html.escape() consistently. insightHCIPriors
mouseover (previously unescaped) now escapes HGVS fields, addressing the
specific c.123A>G case Jonathan flagged. Replace invalid </br> tags with
<br> across all five affected mouseover sites.
diff --git src/hg/makeDb/scripts/insight/buildInsightClinVar.py src/hg/makeDb/scripts/insight/buildInsightClinVar.py
index 7a1d2728f17..a4f4eca2cef 100644
--- src/hg/makeDb/scripts/insight/buildInsightClinVar.py
+++ src/hg/makeDb/scripts/insight/buildInsightClinVar.py
@@ -18,30 +18,31 @@
python3 buildInsightClinVar.py [--output-dir DIR]
Output files:
- insight_clinvar_variants.tsv: Combined variant data from ClinVar
- insightClinVar.as: AutoSQL schema file
- insightClinVar_hg19.bed: BED file for hg19
- insightClinVar_hg38.bed: BED file for hg38
- insightClinVarHg19.bb: bigBed file for hg19
- insightClinVarHg38.bb: bigBed file for hg38
Author: UCSC Genome Browser Group
Date: 2026
"""
import argparse
+import html
import os
import subprocess
import sys
import tempfile
import time
import urllib.request
import xml.etree.ElementTree as ET
# ============================================================================
# Configuration
# ============================================================================
# Genes to fetch from ClinVar (Lynch syndrome MMR genes)
GENES = ["MLH1", "MSH2", "MSH6", "PMS2"]
@@ -98,48 +99,30 @@
string dateEvaluated; "Date of classification"
lstring comment; "InSiGHT submitter comment"
lstring _mouseOver; "HTML mouseover text"
)
"""
# ============================================================================
# Utility Functions
# ============================================================================
def log(msg):
    """Write *msg*, followed by a newline, to standard error."""
    stream = sys.stderr
    print(msg, file=stream)
-def bash(cmd):
- """Run a bash command and return output"""
- result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
- if result.returncode != 0:
- raise RuntimeError(f"Command failed: {cmd}\n{result.stderr}")
- return result.stdout
-
-
-def escape_html(text):
- """Escape special characters for HTML"""
- if not text:
- return ""
- return (str(text).replace('&', '&')
- .replace('<', '<')
- .replace('>', '>')
- .replace('"', '"'))
-
-
def fetch_url(url, max_retries=3, *, timeout=120, retry_delay=2):
    """Fetch a URL and return its body decoded as UTF-8, retrying on failure.

    Args:
        url: URL to open with urllib.
        max_retries: Total number of attempts. Values below 1 are treated
            as 1 so the function never falls through and returns None.
        timeout: Timeout in seconds passed to urlopen for each attempt.
        retry_delay: Seconds to sleep between a failed attempt and the next.

    Returns:
        The response body as a str.

    Raises:
        Exception: whatever the final attempt raised, re-raised unchanged.
    """
    # Guarantee at least one attempt; range(0) would skip the loop entirely
    # and make the function return None without ever touching the network.
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            req = urllib.request.Request(url)
            with urllib.request.urlopen(req, timeout=timeout) as response:
                return response.read().decode('utf-8')
        except Exception as e:
            # Last attempt: propagate the original error to the caller.
            if attempt >= attempts - 1:
                raise
            log(f" Retry {attempt + 1} after error: {e}")
            time.sleep(retry_delay)
@@ -317,31 +300,33 @@
Returns:
dict mapping id to (chrom, start, end) in target assembly
"""
if not coords:
return {}
with tempfile.NamedTemporaryFile(mode='w', suffix='.bed', delete=False) as f:
input_bed = f.name
for var_id, (chrom, start, end) in coords.items():
f.write(f"{chrom}\t{start}\t{end}\t{var_id}\n")
output_bed = input_bed.replace('.bed', '.lifted.bed')
unmapped_bed = input_bed.replace('.bed', '.unmapped.bed')
try:
- bash(f"liftOver {input_bed} {chain_file} {output_bed} {unmapped_bed} 2>/dev/null")
+ subprocess.run(
+ ["liftOver", input_bed, chain_file, output_bed, unmapped_bed],
+ check=True, stderr=subprocess.DEVNULL)
except Exception:
for f in [input_bed, output_bed, unmapped_bed]:
if os.path.exists(f):
os.remove(f)
return {}
lifted = {}
if os.path.exists(output_bed):
with open(output_bed) as f:
for line in f:
fields = line.strip().split('\t')
if len(fields) >= 4:
lifted[fields[3]] = (fields[0], int(fields[1]), int(fields[2]))
for f in [input_bed, output_bed, unmapped_bed]:
@@ -408,37 +393,37 @@
else:
# Missing coordinates
unmapped.append(v)
continue
if start is None:
unmapped.append(v)
continue
# Get color based on classification
color = COLORS.get(v['classification'], DEFAULT_COLOR)
# Build mouseover HTML
clinvar_url = f"https://www.ncbi.nlm.nih.gov/clinvar/variation/{v['var_id']}/"
mouse_over = (
- f"Variant: {escape_html(v['name'])}
"
+ f"Variant: {html.escape(v['name'])}
"
f"ClinVar ID: {v['var_id']}
"
- f"Classification: {escape_html(v['classification'])}
"
- f"Date evaluated: {escape_html(v['date_evaluated'])}"
+ f"Classification: {html.escape(v['classification'])}
"
+ f"Date evaluated: {html.escape(v['date_evaluated'])}"
)
if v['comment']:
- mouse_over += f"
Comment: {escape_html(v['comment'])}"
+ mouse_over += f"
Comment: {html.escape(v['comment'])}"
# Truncate name if too long
name = v['name'] if len(v['name']) <= 200 else v['name'][:197] + "..."
# Review status - use custom text
review_status = "Reviewed by expert panel InSiGHT"
# Build BED9+7 line
comment = v['comment'].replace('\t', ' ').replace('\n', ' ')
bed_fields = [
chrom, # chrom
str(start), # chromStart
str(end), # chromEnd
name, # name
'0', # score
@@ -475,40 +460,43 @@
log(f" Mapped: {stats['mapped']} (native: {stats['mapped_native']}, liftOver: {stats['mapped_liftover']})")
log(f" Unmapped: {len(unmapped)}")
if not entries:
log(" No entries to write!")
return None, None
# Write BED file
bed_file = os.path.join(output_dir, f"insightClinVar_{assembly}.bed")
log(f" Writing BED file: {bed_file}")
with open(bed_file, 'w') as f:
f.write('\n'.join(entries) + '\n')
# Sort BED file
log(f" Sorting BED file...")
- bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}")
+ subprocess.run(["sort", "-k1,1", "-k2,2n", bed_file, "-o", bed_file], check=True)
# Create bigBed
as_file = os.path.join(output_dir, "insightClinVar.as")
bb_file = os.path.join(output_dir, f"insightClinVar{assembly.capitalize()}.bb")
chrom_sizes = CHROM_SIZES[assembly]
log(f" Creating bigBed file: {bb_file}")
try:
- bash(f"bedToBigBed -as={as_file} -type=bed9+7 -tab {bed_file} {chrom_sizes} {bb_file}")
+ subprocess.run(
+ ["bedToBigBed", "-as=" + as_file, "-type=bed9+7", "-tab",
+ bed_file, chrom_sizes, bb_file],
+ check=True)
log(f" Successfully created: {bb_file}")
except Exception as e:
log(f" ERROR creating bigBed: {e}")
bb_file = None
return bed_file, bb_file
# ============================================================================
# Main Pipeline
# ============================================================================
def main():
parser = argparse.ArgumentParser(
description='Build InSiGHT ClinVar VCEP variants bigBed tracks'