ed3cb243b9151cae7d1859712148fa1d1e1a2196
lrnassar
Mon Apr 13 17:34:17 2026 -0700
Adding makedoc and build scripts for InSiGHT VCEP track hub. refs #36582
diff --git src/hg/makeDb/scripts/insight/buildInsightClinVar.py src/hg/makeDb/scripts/insight/buildInsightClinVar.py
new file mode 100644
index 00000000000..d1670392eaa
--- /dev/null
+++ src/hg/makeDb/scripts/insight/buildInsightClinVar.py
@@ -0,0 +1,593 @@
+#!/usr/bin/env python3
+"""
+InSiGHT ClinVar VCEP Variants Pipeline
+
+Fetches InSiGHT Variant Curation Expert Panel (VCEP) submissions from ClinVar
+for Lynch syndrome mismatch repair (MMR) genes, and creates UCSC Genome Browser
+bigBed tracks for both hg19 and hg38.
+
+Genes included:
+- MLH1
+- MSH2
+- MSH6
+- PMS2
+
+Only variants submitted by InSiGHT with "reviewed by expert panel" status are included.
+
+Usage:
+ python3 buildInsightClinVar.py [--output-dir DIR]
+
+Output files:
+ - insight_clinvar_variants.tsv: Combined variant data from ClinVar
+ - insightClinVar.as: AutoSQL schema file
+ - insightClinVar_hg19.bed: BED file for hg19
+ - insightClinVar_hg38.bed: BED file for hg38
+ - insightClinVarHg19.bb: bigBed file for hg19
+ - insightClinVarHg38.bb: bigBed file for hg38
+
+Author: UCSC Genome Browser Group
+Date: 2026
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import tempfile
+import time
+import urllib.request
+import xml.etree.ElementTree as ET
+
+# ============================================================================
+# Configuration
+# ============================================================================
+
+# Genes to fetch from ClinVar (Lynch syndrome MMR genes)
+GENES = ["MLH1", "MSH2", "MSH6", "PMS2"]
+
+# ClinVar API endpoints
+ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+
+# Batch size for API requests
+BATCH_SIZE = 100
+
+# Delay between API requests (be nice to NCBI)
+API_DELAY = 0.5
+
+# LiftOver chain files
+LIFTOVER_CHAINS = {
+ 'hg19_to_hg38': '/cluster/data/hg19/bed/liftOver/hg19ToHg38.over.chain.gz',
+ 'hg38_to_hg19': '/cluster/data/hg38/bed/liftOver/hg38ToHg19.over.chain.gz',
+}
+
+# Chromosome sizes files
+CHROM_SIZES = {
+ 'hg19': '/cluster/data/hg19/chrom.sizes',
+ 'hg38': '/cluster/data/hg38/chrom.sizes',
+}
+
+# Color mapping for ClinVar classifications (RGB format for bigBed)
+# Colors match ClinVar track conventions
+COLORS = {
+ 'Pathogenic': '210,0,0', # red
+ 'Likely pathogenic': '245,152,152', # pink/light red
+ 'Uncertain significance': '0,0,136', # dark blue
+ 'Likely benign': '213,247,213', # lime green
+ 'Benign': '0,210,0', # green
+}
+DEFAULT_COLOR = '136,136,136' # gray for unknown
+
+# AutoSQL definition for the BED9+7 format
+AUTOSQL = """table insightClinVar
+"InSiGHT VCEP curated variants from ClinVar for Lynch syndrome genes"
+ (
+ string chrom; "Reference sequence chromosome or scaffold"
+ uint chromStart; "Start position in chromosome"
+ uint chromEnd; "End position in chromosome"
+ string name; "Variant name (HGVS notation)"
+ uint score; "Not used, set to 0"
+ char[1] strand; "Not used, set to ."
+ uint thickStart; "Same as chromStart"
+ uint thickEnd; "Same as chromEnd"
+ uint reserved; "RGB color value"
+ string gene; "Gene symbol"
+ string varId; "ClinVar variation ID"
+ string classification; "Clinical significance classification"
+ string reviewStatus; "ClinVar review status"
+ string dateEvaluated; "Date of classification"
+ lstring comment; "InSiGHT submitter comment"
+ lstring _mouseOver; "HTML mouseover text"
+ )
+"""
+
+# ============================================================================
+# Utility Functions
+# ============================================================================
+
+def log(msg):
+ """Print log message to stderr"""
+ print(msg, file=sys.stderr)
+
+
+def bash(cmd):
+ """Run a bash command and return output"""
+ result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+ if result.returncode != 0:
+ raise RuntimeError(f"Command failed: {cmd}\n{result.stderr}")
+ return result.stdout
+
+
+def escape_html(text):
+ """Escape special characters for HTML"""
+ if not text:
+ return ""
+ return (str(text).replace('&', '&')
+ .replace('<', '<')
+ .replace('>', '>')
+ .replace('"', '"'))
+
+
+def fetch_url(url, max_retries=3):
+ """Fetch URL with retries"""
+ for attempt in range(max_retries):
+ try:
+ req = urllib.request.Request(url)
+ with urllib.request.urlopen(req, timeout=120) as response:
+ return response.read().decode('utf-8')
+ except Exception as e:
+ if attempt < max_retries - 1:
+ log(f" Retry {attempt + 1} after error: {e}")
+ time.sleep(2)
+ else:
+ raise
+
+
+# ============================================================================
+# ClinVar API Functions
+# ============================================================================
+
+def search_clinvar_ids(gene):
+ """Search ClinVar for InSiGHT submissions for a gene, return variation IDs"""
+ search_term = f"{gene}[gene] AND InSiGHT[Submitter]"
+ url = f"{ESEARCH_URL}?db=clinvar&term={urllib.request.quote(search_term)}&retmax=10000"
+
+ log(f" Searching {gene}...")
+ xml_data = fetch_url(url)
+
+ # Parse IDs from XML
+ root = ET.fromstring(xml_data)
+ ids = [id_elem.text for id_elem in root.findall('.//Id')]
+ log(f" Found {len(ids)} variants")
+
+ return ids
+
+
+def fetch_variant_batch(var_ids):
+ """Fetch full VCV data for a batch of variation IDs"""
+ ids_str = ','.join(var_ids)
+ url = f"{EFETCH_URL}?db=clinvar&rettype=vcv&is_variationid&id={ids_str}"
+ return fetch_url(url)
+
+
+def parse_clinvar_xml(xml_data):
+ """Parse ClinVar XML and extract variant data with InSiGHT submissions"""
+ root = ET.fromstring(xml_data)
+ variants = []
+
+ for var_archive in root.findall('.//VariationArchive'):
+ var_id = var_archive.get('VariationID')
+ var_name = var_archive.get('VariationName', '')
+
+ # Get gene symbol
+ gene_elem = var_archive.find('.//Gene')
+ gene = gene_elem.get('Symbol') if gene_elem is not None else ''
+
+ # Get coordinates from SimpleAllele/Location
+ hg38_chr = hg38_start = hg38_end = None
+ hg19_chr = hg19_start = hg19_end = None
+
+ for seq_loc in var_archive.findall('.//SimpleAllele/Location/SequenceLocation'):
+ assembly = seq_loc.get('Assembly', '')
+ if assembly == 'GRCh38':
+ hg38_chr = 'chr' + seq_loc.get('Chr', '')
+ hg38_start = seq_loc.get('start')
+ hg38_end = seq_loc.get('stop')
+ elif assembly == 'GRCh37':
+ hg19_chr = 'chr' + seq_loc.get('Chr', '')
+ hg19_start = seq_loc.get('start')
+ hg19_end = seq_loc.get('stop')
+
+ # Find InSiGHT submission
+ insight_classification = ''
+ insight_review_status = ''
+ insight_date = ''
+ insight_comment = ''
+
+ for assertion in var_archive.findall('.//ClinicalAssertion'):
+ accession = assertion.find('.//ClinVarAccession')
+ if accession is not None:
+ submitter = accession.get('SubmitterName', '')
+ if 'InSiGHT' in submitter:
+ classification = assertion.find('.//Classification')
+ if classification is not None:
+ insight_date = classification.get('DateLastEvaluated', '')
+ review_elem = classification.find('ReviewStatus')
+ if review_elem is not None:
+ insight_review_status = review_elem.text
+ germ_class = classification.find('GermlineClassification')
+ if germ_class is not None:
+ insight_classification = germ_class.text
+ comment_elem = classification.find('Comment')
+ if comment_elem is not None:
+ insight_comment = comment_elem.text or ''
+ break
+
+ if insight_classification: # Only include if has InSiGHT classification
+ variants.append({
+ 'var_id': var_id,
+ 'gene': gene,
+ 'name': var_name,
+ 'hg38_chr': hg38_chr,
+ 'hg38_start': hg38_start,
+ 'hg38_end': hg38_end,
+ 'hg19_chr': hg19_chr,
+ 'hg19_start': hg19_start,
+ 'hg19_end': hg19_end,
+ 'classification': insight_classification,
+ 'review_status': insight_review_status,
+ 'date_evaluated': insight_date,
+ 'comment': insight_comment,
+ })
+
+ return variants
+
+
+# ============================================================================
+# Data Fetching
+# ============================================================================
+
+def fetch_all_variants():
+ """Fetch all InSiGHT VCEP variants from ClinVar"""
+ all_ids = []
+
+ log("Searching ClinVar for InSiGHT VCEP submissions...")
+ for gene in GENES:
+ ids = search_clinvar_ids(gene)
+ all_ids.extend(ids)
+ time.sleep(API_DELAY)
+
+ # Remove duplicates while preserving order
+ seen = set()
+ unique_ids = []
+ for id in all_ids:
+ if id not in seen:
+ seen.add(id)
+ unique_ids.append(id)
+
+ log(f"Total unique variation IDs: {len(unique_ids)}")
+
+ # Fetch variants in batches
+ all_variants = []
+ num_batches = (len(unique_ids) + BATCH_SIZE - 1) // BATCH_SIZE
+
+ log(f"Fetching variant data in {num_batches} batches...")
+ for i in range(0, len(unique_ids), BATCH_SIZE):
+ batch_num = i // BATCH_SIZE + 1
+ batch_ids = unique_ids[i:i + BATCH_SIZE]
+ log(f" Batch {batch_num}/{num_batches}: fetching {len(batch_ids)} variants...")
+
+ xml_data = fetch_variant_batch(batch_ids)
+ variants = parse_clinvar_xml(xml_data)
+ all_variants.extend(variants)
+
+ time.sleep(API_DELAY)
+
+ log(f"Total variants with InSiGHT classifications: {len(all_variants)}")
+ return all_variants
+
+
+def write_tsv(variants, output_file):
+ """Write variants to TSV file"""
+ log(f"Writing TSV file: {output_file}")
+
+ columns = ['var_id', 'gene', 'name', 'hg38_chr', 'hg38_start', 'hg38_end',
+ 'hg19_chr', 'hg19_start', 'hg19_end', 'classification',
+ 'review_status', 'date_evaluated', 'comment']
+
+ with open(output_file, 'w') as f:
+ f.write('\t'.join(columns) + '\n')
+ for v in variants:
+ row = [str(v.get(col, '') or '') for col in columns]
+ f.write('\t'.join(row) + '\n')
+
+
+# ============================================================================
+# LiftOver
+# ============================================================================
+
+def liftover_coords(coords, chain_file):
+ """
+ Lift over coordinates using UCSC liftOver.
+
+ Args:
+ coords: dict mapping id to (chrom, start, end)
+ chain_file: path to chain file
+
+ Returns:
+ dict mapping id to (chrom, start, end) in target assembly
+ """
+ if not coords:
+ return {}
+
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.bed', delete=False) as f:
+ input_bed = f.name
+ for var_id, (chrom, start, end) in coords.items():
+ f.write(f"{chrom}\t{start}\t{end}\t{var_id}\n")
+
+ output_bed = input_bed.replace('.bed', '.lifted.bed')
+ unmapped_bed = input_bed.replace('.bed', '.unmapped.bed')
+
+ try:
+ bash(f"liftOver {input_bed} {chain_file} {output_bed} {unmapped_bed} 2>/dev/null")
+ except:
+ for f in [input_bed, output_bed, unmapped_bed]:
+ if os.path.exists(f):
+ os.remove(f)
+ return {}
+
+ lifted = {}
+ if os.path.exists(output_bed):
+ with open(output_bed) as f:
+ for line in f:
+ fields = line.strip().split('\t')
+ if len(fields) >= 4:
+ lifted[fields[3]] = (fields[0], int(fields[1]), int(fields[2]))
+
+ for f in [input_bed, output_bed, unmapped_bed]:
+ if os.path.exists(f):
+ os.remove(f)
+
+ return lifted
+
+
+# ============================================================================
+# BED/bigBed Creation
+# ============================================================================
+
+def create_bed_entries(variants, assembly):
+ """Create BED entries for the given assembly"""
+ entries = []
+ unmapped = []
+ stats = {'total': 0, 'mapped': 0, 'mapped_native': 0, 'mapped_liftover': 0}
+
+ # Determine coordinate columns
+ if assembly == 'hg38':
+ chr_col, start_col, end_col = 'hg38_chr', 'hg38_start', 'hg38_end'
+ alt_chr_col, alt_start_col, alt_end_col = 'hg19_chr', 'hg19_start', 'hg19_end'
+ chain_file = LIFTOVER_CHAINS['hg19_to_hg38']
+ else:
+ chr_col, start_col, end_col = 'hg19_chr', 'hg19_start', 'hg19_end'
+ alt_chr_col, alt_start_col, alt_end_col = 'hg38_chr', 'hg38_start', 'hg38_end'
+ chain_file = LIFTOVER_CHAINS['hg38_to_hg19']
+
+ # First pass: identify variants needing liftOver
+ needs_liftover = {}
+ for v in variants:
+ stats['total'] += 1
+ if not v.get(start_col):
+ # Try alternate assembly coordinates for liftOver
+ if v.get(alt_start_col):
+ needs_liftover[v['var_id']] = (
+ v[alt_chr_col],
+ int(v[alt_start_col]) - 1, # Convert to 0-based
+ int(v[alt_end_col])
+ )
+
+ # Perform liftOver
+ lifted = {}
+ if needs_liftover:
+ log(f" Attempting liftOver for {len(needs_liftover)} variants...")
+ lifted = liftover_coords(needs_liftover, chain_file)
+ log(f" Successfully lifted: {len(lifted)}")
+
+ # Second pass: create BED entries
+ for v in variants:
+ chrom = v.get(chr_col)
+ start = v.get(start_col)
+ end = v.get(end_col)
+ used_liftover = False
+
+ if not start and v['var_id'] in lifted:
+ chrom, start, end = lifted[v['var_id']]
+ # Already 0-based from liftOver
+ used_liftover = True
+ elif start and end:
+ start = int(start) - 1 # Convert to 0-based
+ end = int(end)
+ else:
+ # Missing coordinates
+ unmapped.append(v)
+ continue
+
+ if start is None:
+ unmapped.append(v)
+ continue
+
+ # Get color based on classification
+ color = COLORS.get(v['classification'], DEFAULT_COLOR)
+
+ # Build mouseover HTML
+ clinvar_url = f"https://www.ncbi.nlm.nih.gov/clinvar/variation/{v['var_id']}/"
+ mouse_over = (
+ f"Variant: {escape_html(v['name'])}
"
+ f"ClinVar ID: {v['var_id']}
"
+ f"Classification: {escape_html(v['classification'])}
"
+ f"Date evaluated: {escape_html(v['date_evaluated'])}"
+ )
+ if v['comment']:
+ mouse_over += f"
Comment: {escape_html(v['comment'])}"
+
+ # Truncate name if too long
+ name = v['name'] if len(v['name']) <= 200 else v['name'][:197] + "..."
+
+ # Review status - use custom text
+ review_status = "Reviewed by expert panel InSiGHT"
+
+ # Build BED9+7 line
+ comment = v['comment'].replace('\t', ' ').replace('\n', ' ')
+ bed_fields = [
+ chrom, # chrom
+ str(start), # chromStart
+ str(end), # chromEnd
+ name, # name
+ '0', # score
+ '.', # strand
+ str(start), # thickStart
+ str(end), # thickEnd
+ color, # itemRgb
+ v['gene'], # gene
+ v['var_id'], # varId
+ v['classification'], # classification
+ review_status, # reviewStatus
+ v['date_evaluated'], # dateEvaluated
+ comment, # comment
+ mouse_over, # _mouseOver
+ ]
+
+ entries.append('\t'.join(bed_fields))
+ stats['mapped'] += 1
+ if used_liftover:
+ stats['mapped_liftover'] += 1
+ else:
+ stats['mapped_native'] += 1
+
+ return entries, stats, unmapped
+
+
+def create_track(variants, assembly, output_dir):
+ """Create BED and bigBed files for a given assembly"""
+ log(f"\nProcessing {assembly}...")
+
+ entries, stats, unmapped = create_bed_entries(variants, assembly)
+
+ log(f" Total variants: {stats['total']}")
+ log(f" Mapped: {stats['mapped']} (native: {stats['mapped_native']}, liftOver: {stats['mapped_liftover']})")
+ log(f" Unmapped: {len(unmapped)}")
+
+ if not entries:
+ log(" No entries to write!")
+ return None, None
+
+ # Write BED file
+ bed_file = os.path.join(output_dir, f"insightClinVar_{assembly}.bed")
+ log(f" Writing BED file: {bed_file}")
+ with open(bed_file, 'w') as f:
+ f.write('\n'.join(entries) + '\n')
+
+ # Sort BED file
+ log(f" Sorting BED file...")
+ bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}")
+
+ # Create bigBed
+ as_file = os.path.join(output_dir, "insightClinVar.as")
+ bb_file = os.path.join(output_dir, f"insightClinVar{assembly.capitalize()}.bb")
+ chrom_sizes = CHROM_SIZES[assembly]
+
+ log(f" Creating bigBed file: {bb_file}")
+ try:
+ bash(f"bedToBigBed -as={as_file} -type=bed9+7 -tab {bed_file} {chrom_sizes} {bb_file}")
+ log(f" Successfully created: {bb_file}")
+ except Exception as e:
+ log(f" ERROR creating bigBed: {e}")
+ bb_file = None
+
+ return bed_file, bb_file
+
+
+# ============================================================================
+# Main Pipeline
+# ============================================================================
+
+def main():
+ parser = argparse.ArgumentParser(
+ description='Build InSiGHT ClinVar VCEP variants bigBed tracks'
+ )
+ parser.add_argument(
+ '--output-dir', '-o',
+ default=os.path.dirname(os.path.abspath(__file__)),
+ help='Output directory (default: script directory)'
+ )
+ args = parser.parse_args()
+
+ output_dir = args.output_dir
+ os.makedirs(output_dir, exist_ok=True)
+
+ log("=" * 70)
+ log("InSiGHT ClinVar VCEP Variants Pipeline")
+ log("=" * 70)
+
+ # Step 1: Fetch data from ClinVar API
+ variants = fetch_all_variants()
+ if not variants:
+ log("ERROR: No variants fetched!")
+ sys.exit(1)
+
+ # Step 2: Write combined TSV
+ tsv_file = os.path.join(output_dir, "insight_clinvar_variants.tsv")
+ write_tsv(variants, tsv_file)
+
+ # Step 3: Write AutoSQL file
+ as_file = os.path.join(output_dir, "insightClinVar.as")
+ log(f"Writing AutoSQL file: {as_file}")
+ with open(as_file, 'w') as f:
+ f.write(AUTOSQL)
+
+ # Step 4: Create tracks for both assemblies
+ output_files = {}
+ for assembly in ['hg38', 'hg19']:
+ try:
+ bed_file, bb_file = create_track(variants, assembly, output_dir)
+ output_files[assembly] = {'bed': bed_file, 'bb': bb_file}
+ except Exception as e:
+ log(f"ERROR processing {assembly}: {e}")
+ import traceback
+ traceback.print_exc()
+
+ # Print summary
+ log("\n" + "=" * 70)
+ log("Summary")
+ log("=" * 70)
+
+ # Classification counts
+ from collections import Counter
+ class_counts = Counter(v['classification'] for v in variants)
+ log("\nClassifications:")
+ for cls, count in class_counts.most_common():
+ log(f" {cls}: {count}")
+
+ # Gene counts
+ gene_counts = Counter(v['gene'] for v in variants)
+ log("\nGenes:")
+ for gene in GENES:
+ log(f" {gene}: {gene_counts.get(gene, 0)}")
+
+ log("\n" + "=" * 70)
+ log("Output Files")
+ log("=" * 70)
+ log(f" TSV file: {tsv_file}")
+ log(f" AutoSql: {as_file}")
+ for assembly, files in output_files.items():
+ log(f"\n {assembly}:")
+ if files.get('bed'):
+ log(f" BED: {files['bed']}")
+ if files.get('bb') and os.path.exists(files['bb']):
+ log(f" bigBed: {files['bb']}")
+
+ log("\n" + "=" * 70)
+ log("Done!")
+ log("=" * 70)
+
+
+if __name__ == "__main__":
+ main()