ed3cb243b9151cae7d1859712148fa1d1e1a2196
lrnassar
Mon Apr 13 17:34:17 2026 -0700
Adding makedoc and build scripts for InSiGHT VCEP track hub. refs #36582
diff --git src/hg/makeDb/scripts/insight/insightFunctionalAssays.py src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
new file mode 100644
index 00000000000..32ceaa0699e
--- /dev/null
+++ src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
@@ -0,0 +1,963 @@
+#!/usr/bin/env python3
+"""
+InSiGHT VCEP Functional Assays Track Generator
+
+Generates UCSC Genome Browser tracks (bigBed format) for functional assay evidence
+from published studies on Lynch syndrome MMR genes. Combines data from:
+
+ Paper 1: Drost et al. 2018 (PMID:30504929) - MLH1/MSH2 CIMRA assay
+ Paper 2: Drost et al. 2020 (PMID:31965077) - MSH6 CIMRA assay
+ Paper 3: Jia et al. 2021 (PMID:33357406) - MSH2 deep mutational scanning
+ Paper 4: Rath et al. 2022 (PMID:36054288) - MLH1 cell-based functional assay
+
+Each variant is classified according to ACMG PS3/BS3 criteria based on
+Tavtigian OddsPath thresholds or Jia LOF score thresholds.
+
+Output: BED9+7 bigBed files for hg38 and hg19.
+
+Supplementary data files (place in same directory as this script):
+ drost2018_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/7901556/bin/NIHMS1010100-supplement-CIMRA.pdf
+ drost2020_supplement.docx - https://static-content.springer.com/esm/art%3A10.1038%2Fs41436-019-0736-2/MediaObjects/41436_2019_736_MOESM2_ESM.docx
+ mmc2.xlsx - https://pmc.ncbi.nlm.nih.gov/articles/instance/7820803/bin/mmc2.xlsx
+ rath2022_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/9772141/bin/NIHMS1834139-supplement-supinfo.pdf
+
+Author: Generated for InSiGHT VCEP
+Date: 2025
+"""
+
+import subprocess
+import os
+import re
+import sys
+import zipfile
+import xml.etree.ElementTree as ET
+
+try:
+ import openpyxl
+except ImportError:
+ print("ERROR: openpyxl is required. Install with: pip install openpyxl")
+ sys.exit(1)
+
# ============================================================================
# Configuration
# ============================================================================
# All supplementary input files and generated outputs live here.
OUTPUT_DIR = "/hive/users/lrnassar/insightHub/functionalAssays"

# Transcripts for coordinate mapping (current MANE versions in hgsql)
TRANSCRIPTS = {
    'MLH1': 'NM_000249.4',
    'MSH2': 'NM_000251.3',
    'MSH6': 'NM_000179.3',
}

# Paper-specific transcript versions (used in the name field per paper's notation)
# Note these may be older versions than TRANSCRIPTS above; coordinates are
# always mapped with TRANSCRIPTS, these only label the BED name field.
PAPER_TRANSCRIPTS = {
    'drost2018': {'MLH1': 'NM_000249.3', 'MSH2': 'NM_000251.2'},
    'drost2020': {'MSH6': 'NM_000179.2'},
    'rath2022': {'MLH1': 'NM_000249.4'},
}

# Classification colors (RGB) from insight.html
COLORS = {
    'PS3_Strong': '180,0,0',        # Dark Red
    'PS3_Moderate': '210,0,0',      # Red
    'PS3_Supporting': '245,152,152', # Pink
    'Indeterminate': '128,128,128', # Gray
    'BS3_Supporting': '213,247,213', # Lime Green
    'BS3_Strong': '0,210,0',        # Green
}

# Paper references and PMIDs
PAPERS = {
    'drost2018': {'ref': 'Drost et al., 2018', 'pmid': '30504929'},
    'drost2020': {'ref': 'Drost et al., 2020', 'pmid': '31965077'},
    'jia2021': {'ref': 'Jia et al., 2021', 'pmid': '33357406'},
    'rath2022': {'ref': 'Rath et al., 2022', 'pmid': '36054288'},
}

# AutoSQL definition for the BED9+7 schema consumed by bedToBigBed.
AUTOSQL = """table insightFunctionalAssays
"Functional assay evidence for Lynch syndrome MMR gene variant classification"
    (
    string chrom;      "Reference sequence chromosome or scaffold"
    uint   chromStart; "Start position in chromosome"
    uint   chromEnd;   "End position in chromosome"
    string name;       "Variant name (HGVSc or HGVSp notation)"
    uint score;        "Not used, set to 0"
    char[1] strand;    "Not used, set to ."
    uint thickStart;   "Same as chromStart"
    uint thickEnd;     "Same as chromEnd"
    uint reserved;     "RGB color value"
    string gene;          "Gene symbol"
    string protein;       "Protein change (HGVSp notation)"
    string classification; "ACMG evidence classification (PS3/BS3)"
    string clinVarId;     "ClinVar variation ID, if available"
    string score_value;   "Functional score (OddsPath or LOF)"
    string paperRef;      "Publication reference"
    lstring _mouseOver;   "HTML mouseover text"
    )
"""
+
+# ============================================================================
+# Utility functions
+# ============================================================================
+
def bash(cmd):
    """Run cmd through a bash subprocess and return its combined output.

    stderr is folded into stdout; a nonzero exit status is surfaced as a
    RuntimeError carrying the command, exit code and captured output.
    """
    try:
        completed = subprocess.run(cmd, check=True, shell=True,
            stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
    return completed.stdout
+
def get_transcript_info(db, accession):
    """Look up one transcript in <db>.ncbiRefSeq via the hgsql CLI.

    Returns a dict with name/chrom/strand, tx and CDS bounds as ints, and
    parallel exonStarts/exonEnds int lists. Raises ValueError when the
    accession is not present in the table.
    """
    query = f"SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{accession}'"
    result = bash(f'hgsql {db} -Ne "{query}"')

    if not result.strip():
        raise ValueError(f"Transcript {accession} not found in {db}.ncbiRefSeq")

    cols = result.strip().split('\t')

    def coord_list(blob):
        # hgsql emits block coordinates comma-separated with a trailing comma
        return [int(v) for v in blob.rstrip(',').split(',')]

    return {
        'name': cols[0],
        'chrom': cols[1],
        'strand': cols[2],
        'txStart': int(cols[3]),
        'txEnd': int(cols[4]),
        'cdsStart': int(cols[5]),
        'cdsEnd': int(cols[6]),
        'exonStarts': coord_list(cols[7]),
        'exonEnds': coord_list(cols[8]),
    }
+
def build_cds_regions(tx):
    """Return coding-sequence intervals (0-based half-open genomic) for tx.

    Exons are clipped to [cdsStart, cdsEnd); wholly non-coding exons are
    dropped. For minus-strand transcripts the list is reversed so it runs
    in CDS order (the CDS starts at the high genomic coordinate).
    """
    cds_lo, cds_hi = tx['cdsStart'], tx['cdsEnd']
    regions = [
        (max(ex_start, cds_lo), min(ex_end, cds_hi))
        for ex_start, ex_end in zip(tx['exonStarts'], tx['exonEnds'])
        if ex_end > cds_lo and ex_start < cds_hi
    ]
    return regions[::-1] if tx['strand'] == '-' else regions
+
def cds_pos_to_genomic(cds_pos, cds_regions, strand):
    """Map a 1-based CDS nucleotide position to genomic coordinates.

    cds_regions must already be in CDS order (reversed for minus strand,
    as produced by build_cds_regions). Returns a 0-based half-open
    (start, end) pair covering the single base, or (None, None) when the
    position falls outside the CDS.
    """
    consumed = 0

    for region_lo, region_hi in cds_regions:
        size = region_hi - region_lo

        if consumed < cds_pos <= consumed + size:
            within = cds_pos - consumed - 1
            if strand == '+':
                # Plus strand counts forward from the region start
                genomic = region_lo + within
            else:
                # Minus strand counts backward from the region end
                genomic = region_hi - within - 1
            return genomic, genomic + 1

        consumed += size

    return None, None
+
def aa_to_genomic(aa_pos, cds_regions):
    """Map a 1-based amino-acid position to the genomic span of its codon.

    Plus-strand only (MLH1, MSH2 and MSH6 are all on the + strand).
    Returns a 0-based half-open (start, end) covering the 3-base codon,
    or (None, None) when either end of the codon lies outside the CDS.
    """
    codon_first = 3 * (aa_pos - 1) + 1  # 1-based CDS position of codon base 1
    codon_last = 3 * aa_pos             # ...and of codon base 3

    span_lo, _ = cds_pos_to_genomic(codon_first, cds_regions, '+')
    _, span_hi = cds_pos_to_genomic(codon_last, cds_regions, '+')

    if span_lo is None or span_hi is None:
        return None, None
    return span_lo, span_hi
+
def parse_hgvsc_pos(hgvsc):
    """Extract the 1-based CDS span from a simple HGVSc string.

    Handles single-base substitutions and two-position delins only;
    anything else (intronic offsets, plain dels, dups, ...) yields
    (None, None).

    Examples:
      - c.4T>A -> (4, 4)
      - c.100G>C -> (100, 100)
      - c.104_105delinsAC -> (104, 105)
      - c.1852_1853delinsGC -> (1852, 1853)
    """
    text = hgvsc.strip()

    substitution = re.match(r'c\.(\d+)[ACGT]>[ACGT]', text)
    if substitution:
        where = int(substitution.group(1))
        return where, where

    delins = re.match(r'c\.(\d+)_(\d+)delins', text)
    if delins:
        return int(delins.group(1)), int(delins.group(2))

    return None, None
+
def parse_protein_pos(prot):
    """Return the 1-based residue number from protein-change notation.

    Accepts an optional 'p.' prefix and a '*' (stop) reference residue.
    Returns None when no leading residue/position can be matched.

    Examples:
      - p.W372R -> 372
      - p.K13T -> 13
      - M1A -> 1
    """
    text = prot.strip()
    if text.startswith('p.'):
        text = text[2:]
    hit = re.match(r'[A-Z*](\d+)', text)
    return int(hit.group(1)) if hit else None
+
+# ============================================================================
+# Classification functions
+# ============================================================================
+
def classify_tavtigian(odds_path):
    """Bin a Tavtigian OddsPath value into an ACMG PS3/BS3 strength label.

    Used for Papers 1, 2, 4 (CIMRA assays). Bins (lower bound exclusive):
      > 18.7  PS3_Strong
      > 4.3   PS3_Moderate
      > 2.08  PS3_Supporting
      > 0.48  Indeterminate
      > 0.05  BS3_Supporting
      else    BS3_Strong
    Returns None when odds_path cannot be converted to float.
    """
    try:
        odds = float(odds_path)
    except (ValueError, TypeError):
        return None

    thresholds = (
        (18.7, 'PS3_Strong'),
        (4.3, 'PS3_Moderate'),
        (2.08, 'PS3_Supporting'),
        (0.48, 'Indeterminate'),
        (0.05, 'BS3_Supporting'),
    )
    for floor, label in thresholds:
        if odds > floor:
            return label
    return 'BS3_Strong'
+
def classify_jia_lof(lof_score):
    """Bin a Jia deep-mutational-scanning LOF score into PS3/BS3 labels.

    Used for Paper 3. Bins:
      >= 0.4     PS3_Strong
      [0, 0.4)   Indeterminate
      < 0        BS3_Strong
    Returns None when lof_score cannot be converted to float.
    """
    try:
        lof = float(lof_score)
    except (ValueError, TypeError):
        return None

    if lof < 0:
        return 'BS3_Strong'
    return 'PS3_Strong' if lof >= 0.4 else 'Indeterminate'
+
+# ============================================================================
+# MouseOver builder
+# ============================================================================
+
def build_mouseover(name, protein, classification, clinvar_id, score_val, score_label, paper_key):
    """Build the HTML mouseover text for a variant.

    name: HGVSc/HGVSp display name for the variant
    protein: HGVSp protein change
    classification: PS3/BS3 bin label
    clinvar_id: ClinVar variation ID; '', 'NA' or falsy renders as 'N/A'
    score_val / score_label: functional score and its label (OddsPath/LOF)
    paper_key: key into the module-level PAPERS dict

    Returns the fields joined with <br> tags: the _mouseOver autoSql field
    is declared as HTML, so <br> renders each field on its own line.
    """
    paper_info = PAPERS[paper_key]
    parts = []
    parts.append(f"HGVSc/HGVSp: {name}")
    parts.append(f"Protein: {protein}")
    parts.append(f"Classification: {classification}")

    if clinvar_id and clinvar_id != 'NA' and clinvar_id != '':
        parts.append(f'ClinVar ID: {clinvar_id}')
    else:
        parts.append("ClinVar ID: N/A")

    parts.append(f"{score_label}: {score_val}")
    parts.append(f'Paper: {paper_info["ref"]}')

    # BUG FIX: was "".join(parts), which ran all fields together into one
    # unreadable string with no separator at all.
    return "<br>".join(parts)
+
+# ============================================================================
+# BED entry builder
+# ============================================================================
+
def make_bed_entry(chrom, start, end, name, color, gene, protein,
                   classification, clinvar_id, score_val, paper_ref, mouseover):
    """Assemble one tab-separated BED9+7 line.

    score is fixed at 0, strand at '.', and thickStart/thickEnd mirror
    chromStart/chromEnd, matching the track's autoSql definition.
    """
    fields = [chrom, start, end, name, 0, '.', start, end, color,
              gene, protein, classification, clinvar_id,
              score_val, paper_ref, mouseover]
    return '\t'.join(str(field) for field in fields)
+
+# ============================================================================
+# Data parsing: Paper 1 - Drost et al. 2018
+# ============================================================================
+
def parse_drost2018(filepath):
    """
    Drost et al. 2018 (PMID:30504929) - Supplementary Table S2.
    Source: NIHMS1010100-supplement-CIMRA.pdf
    Genes: MLH1 (NM_000249.3), MSH2 (NM_000251.2)

    Data hardcoded from published supplement for reliability.
    OddsPath values are the CIMRA-based "Odds for Pathogenicity" column
    (Classification Prior-P + CIMRA section). European decimal commas
    converted to periods.

    ClinVar ID exceptions:
    - V470G (c.1409T>G): Paper listed ClinVar 90658, but that ID now resolves
      to V470E (c.1409T>A) — a different nucleotide change at the same position.
      ClinVar appears to have reassigned the ID since publication. Removed to
      avoid linking to the wrong variant.

    filepath: accepted for interface symmetry with the other parse_*
        functions, but unused — the data below is hardcoded.

    Returns list of variant dicts (keys: gene, dna, protein, clinvar_id,
    odds_path, name, paper, coord_type).
    """
    # (gene, dna, protein, clinvar_id, odds_path_cimra)
    DATA = [
        # MLH1 variants (39)
        ('MLH1', 'c.62C>A', 'p.A21E', '90297', '23.621'),
        ('MLH1', 'c.62C>T', 'p.A21V', '90298', '29.238'),
        ('MLH1', 'c.73A>T', 'p.I25F', '90343', '23.919'),
        ('MLH1', 'c.104_105delinsAC', 'p.M35N', '17105', '20.835'),
        ('MLH1', 'c.143A>C', 'p.Q48P', '89739', '31.923'),
        ('MLH1', 'c.146T>A', 'p.V49E', '89749', '7.829'),
        ('MLH1', 'c.191A>G', 'p.N64S', '89947', '0.483'),
        ('MLH1', 'c.203T>A', 'p.I68N', '90008', '29.607'),
        ('MLH1', 'c.244A>G', 'p.T82A', '90116', '47.103'),
        ('MLH1', 'c.245C>T', 'p.T82I', '90118', '33.147'),
        ('MLH1', 'c.301G>A', 'p.G101S', '90137', '44.238'),
        ('MLH1', 'c.332C>T', 'p.A111V', '90171', '21.908'),
        ('MLH1', 'c.346A>C', 'p.T116P', '', '17.699'),
        ('MLH1', 'c.382G>C', 'p.A128P', '90199', '32.734'),
        ('MLH1', 'c.394G>C', 'p.D132H', '17096', '0.002'),
        ('MLH1', 'c.779T>G', 'p.L260R', '90350', '18.378'),
        ('MLH1', 'c.793C>A', 'p.R265S', '90380', '10.580'),
        ('MLH1', 'c.803A>G', 'p.E268G', '90382', '0.070'),
        ('MLH1', 'c.911A>T', 'p.D304V', '90439', '33.990'),
        ('MLH1', 'c.918T>A', 'p.N306K', '90441', '51.428'),
        ('MLH1', 'c.1321G>A', 'p.A441T', '89696', '0.001'),
        ('MLH1', 'c.1664T>C', 'p.L555P', '89822', '57.577'),
        ('MLH1', 'c.1733A>G', 'p.E578G', '17090', '0.003'),
        ('MLH1', 'c.1742C>T', 'p.P581L', '41635', '0.002'),
        ('MLH1', 'c.1745T>C', 'p.L582P', '89871', '28.874'),
        ('MLH1', 'c.1799A>G', 'p.E600G', '89890', '0.002'),
        ('MLH1', 'c.1808C>G', 'p.P603R', '41636', '0.002'),
        ('MLH1', 'c.1823C>A', 'p.A608D', '89897', '2.905'),
        ('MLH1', 'c.1853A>C', 'p.K618T', '89906', '3.377'),
        ('MLH1', 'c.1855G>C', 'p.A619P', '89909', '2.833'),
        ('MLH1', 'c.1865T>A', 'p.L622H', '29657', '6.486'),
        ('MLH1', 'c.1943C>T', 'p.P648L', '89953', '0.521'),
        ('MLH1', 'c.1942C>T', 'p.P648S', '17097', '3.507'),
        ('MLH1', 'c.1976G>T', 'p.R659L', '89966', '4.507'),
        ('MLH1', 'c.2038T>C', 'p.C680R', '90006', '37.579'),
        ('MLH1', 'c.2041G>A', 'p.A681T', '17099', '0.031'),
        ('MLH1', 'c.2101C>A', 'p.Q701K', '90042', '0.034'),
        ('MLH1', 'c.2128A>G', 'p.N710D', '', '0.160'),
        ('MLH1', 'c.2263A>G', 'p.R755G', '219292', '68.636'),
        # MSH2 variants (35)
        ('MSH2', 'c.23C>T', 'p.T8M', '90964', '0.166'),
        ('MSH2', 'c.277C>T', 'p.L93F', '91044', '0.554'),
        ('MSH2', 'c.287G>A', 'p.R96H', '91053', '0.051'),
        ('MSH2', 'c.317G>A', 'p.R106K', '91062', '0.013'),
        ('MSH2', 'c.488T>A', 'p.V163D', '91107', '89.331'),
        ('MSH2', 'c.488T>G', 'p.V163G', '91108', '79.790'),
        ('MSH2', 'c.493T>G', 'p.Y165D', '91111', '81.818'),
        ('MSH2', 'c.505A>G', 'p.I169V', '91115', '0.000'),
        ('MSH2', 'c.560T>G', 'p.L187R', '91135', '59.040'),
        ('MSH2', 'c.595T>C', 'p.C199R', '91146', '105.160'),
        ('MSH2', 'c.599T>A', 'p.V200D', '91148', '89.331'),
        ('MSH2', 'c.929T>C', 'p.L310P', '91245', '55.450'),
        ('MSH2', 'c.989T>C', 'p.L330P', '91270', '81.818'),
        ('MSH2', 'c.991A>G', 'p.N331D', '91271', '0.125'),
        ('MSH2', 'c.1022T>C', 'p.L341P', '90507', '42.073'),
        ('MSH2', 'c.1046C>G', 'p.P349R', '90513', '80.798'),
        ('MSH2', 'c.1077A>T', 'p.R359S', '90539', '62.080'),
        ('MSH2', 'c.1168C>T', 'p.L390F', '41641', '0.003'),
        ('MSH2', 'c.1255C>A', 'p.Q419K', '90583', '0.002'),
        ('MSH2', 'c.1319T>C', 'p.L440P', '90625', '33.990'),
        ('MSH2', 'c.1409T>G', 'p.V470G', '', '83.898'), # Paper listed ClinVar 90658, but that ID now points to V470E (c.1409T>A); removed
        ('MSH2', 'c.1571G>T', 'p.R524L', '90701', '0.179'),
        ('MSH2', 'c.1690A>G', 'p.T564A', '90750', '0.001'),
        ('MSH2', 'c.1730T>C', 'p.I577T', '41644', '0.323'),
        ('MSH2', 'c.1808A>G', 'p.D603G', '90791', '66.934'),
        ('MSH2', 'c.1955C>A', 'p.P652H', '90823', '0.035'),
        ('MSH2', 'c.1963G>A', 'p.V655I', '127637', '0.002'),
        ('MSH2', 'c.2021G>A', 'p.G674D', '90861', '50.787'),
        ('MSH2', 'c.2023A>G', 'p.K675E', '408453', '54.075'),
        ('MSH2', 'c.2047G>A', 'p.G683R', '90868', '74.003'),
        ('MSH2', 'c.2063T>G', 'p.M688R', '90874', '77.813'),
        ('MSH2', 'c.2087C>T', 'p.P696L', '90881', '18.845'),
        ('MSH2', 'c.2276G>A', 'p.G759E', '41866', '2.661'),
        ('MSH2', 'c.2287G>C', 'p.A763P', '', '35.294'),
        ('MSH2', 'c.2500G>A', 'p.A834T', '90986', '0.004'),
    ]

    print(f"  Loading Drost 2018 data (hardcoded from Supplementary Table S2)...")
    variants = []
    for gene, dna, protein, clinvar_id, odds_path in DATA:
        # Label with the transcript version the paper itself used
        transcript = PAPER_TRANSCRIPTS['drost2018'][gene]
        variants.append({
            'gene': gene,
            'dna': dna,
            'protein': protein,
            'clinvar_id': clinvar_id,
            'odds_path': odds_path,
            'name': f"{transcript}:{dna}",
            'paper': 'drost2018',
            'coord_type': 'cds',  # coordinates come from the HGVSc position
        })

    print(f"  Loaded {len(variants)} variants from Drost 2018")
    return variants
+
+# ============================================================================
+# Data parsing: Paper 2 - Drost et al. 2020
+# ============================================================================
+
def get_docx_tables(docx_path):
    """Extract all tables from a DOCX file's word/document.xml.

    Returns a list of tables, each a list of rows, each row a list of cell
    strings (every w:t text run inside the cell concatenated in order).
    """
    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

    with zipfile.ZipFile(docx_path, 'r') as archive:
        doc_root = ET.fromstring(archive.read('word/document.xml'))

    parsed = []
    for tbl in doc_root.findall('.//w:tbl', ns):
        rows = []
        for tr in tbl.findall('.//w:tr', ns):
            rows.append([
                ''.join(t.text for t in tc.findall('.//w:t', ns) if t.text)
                for tc in tr.findall('.//w:tc', ns)
            ])
        parsed.append(rows)

    return parsed
+
def parse_drost2020(filepath):
    """
    Parse Drost 2020 supplementary DOCX (Tables S1, S3, S5).
    Gene: MSH6 (NM_000179.2 for S1/S3, protein-only for S5)
    Returns list of variant dicts (empty list if the file is missing).

    Table structure (from DOCX XML):
    Table 0 = S1 (31 calibration variants): DNA col 0, Protein col 1, ClinVar col 2, OddsPath col 10
    Table 2 = S3 (18 additional variants): DNA col 0, Protein col 1, ClinVar col 2, OddsPath col 8
    Table 4 = S5 (38 screen variants): Protein col 0, OddsPath col 4
    NOTE(review): odd-index tables are skipped by design — presumably
    caption/legend tables; confirm against the actual DOCX.

    Errata corrections:
    - p.V509Ag (c.1526T>C) in S1: Trailing 'g' is a typo in the published
      DOCX. ClinVar 41588 confirms the correct notation is p.Val509Ala
      (p.V509A). Corrected during parsing.
    """
    if not os.path.exists(filepath):
        print(f"  WARNING: {filepath} not found, skipping Drost 2020")
        return []

    print(f"  Parsing {filepath}...")
    tables = get_docx_tables(filepath)
    transcript = PAPER_TRANSCRIPTS['drost2020']['MSH6']

    variants = []

    # Table 0 = S1: 31 calibration variants (rows 2-32, skipping header rows 0-1)
    if len(tables) > 0:
        table_s1 = tables[0]
        for row in table_s1[2:]:  # Skip 2 header rows
            if len(row) < 11:
                continue
            dna = row[0].strip()
            protein = row[1].strip()
            clinvar_id = row[2].strip()
            odds_path = row[10].strip()

            # Rows whose first cell is not HGVSc (footers, blanks) are skipped
            if not dna.startswith('c.'):
                continue
            if clinvar_id == 'NA':
                clinvar_id = ''
            # Fix known typos in published DOCX: trailing 'g' on protein names
            # is a formatting artifact (e.g. p.V509Ag, p.E220Dg, p.E1163Vg, p.R1304Kg)
            # ClinVar confirms correct forms without trailing 'g'
            protein = re.sub(r'^(p\.[A-Z]\d+[A-Z])g$', r'\1', protein)

            variants.append({
                'gene': 'MSH6',
                'dna': dna,
                'protein': protein,
                'clinvar_id': clinvar_id,
                'odds_path': odds_path,
                'name': f"{transcript}:{dna}",
                'paper': 'drost2020',
                'coord_type': 'cds',
            })
        # Only S1 rows exist in `variants` at this point, so this counts S1 only
        print(f"    S1: {len([v for v in variants if v['coord_type'] == 'cds'])} variants")

    # Table 2 = S3: 18 additional variants (rows 2-19, skipping header rows 0-1)
    s3_count = 0
    if len(tables) > 2:
        table_s3 = tables[2]
        for row in table_s3[2:]:  # Skip 2 header rows
            if len(row) < 9:
                continue
            dna = row[0].strip()
            protein = row[1].strip()
            clinvar_id = row[2].strip()
            odds_path = row[8].strip()

            if not dna.startswith('c.'):
                continue
            if clinvar_id == 'NA':
                clinvar_id = ''

            variants.append({
                'gene': 'MSH6',
                'dna': dna,
                'protein': protein,
                'clinvar_id': clinvar_id,
                'odds_path': odds_path,
                'name': f"{transcript}:{dna}",
                'paper': 'drost2020',
                'coord_type': 'cds',
            })
            s3_count += 1
        print(f"    S3: {s3_count} variants")

    # Table 4 = S5: 38 genetic screen variants (rows 2-39, skipping header rows 0-1)
    # S5 gives only protein changes, so these are mapped via codon coordinates
    s5_count = 0
    if len(tables) > 4:
        table_s5 = tables[4]
        for row in table_s5[2:]:  # Skip 2 header rows
            if len(row) < 5:
                continue
            protein = row[0].strip()
            odds_path = row[4].strip()

            if not protein.startswith('p.'):
                continue

            variants.append({
                'gene': 'MSH6',
                'dna': '',
                'protein': protein,
                'clinvar_id': '',
                'odds_path': odds_path,
                'name': protein,
                'paper': 'drost2020',
                'coord_type': 'protein',
            })
            s5_count += 1
        print(f"    S5: {s5_count} variants")

    print(f"  Total: {len(variants)} variants from Drost 2020")
    return variants
+
+# ============================================================================
+# Data parsing: Paper 3 - Jia et al. 2021
+# ============================================================================
+
def parse_jia2021(filepath):
    """
    Parse Jia 2021 supplementary XLSX (TableS5_AllVariants).
    Gene: MSH2 (NM_000251, mapped using NM_000251.3)
    Returns list of variant dicts (empty list if the file is missing).

    TableS5 columns: Variant (col 0), Position (col 1), LOF score (col 2)
    Variants in compact notation like M1A, need p. prefix.
    Skip variants with LOF score = None.
    """
    if not os.path.exists(filepath):
        print(f"  WARNING: {filepath} not found, skipping Jia 2021")
        return []

    print(f"  Parsing {filepath}...")
    # read_only streams rows without loading the whole workbook into memory
    wb = openpyxl.load_workbook(filepath, read_only=True)

    variants = []
    ws = wb['TableS5_AllVariants']

    skipped_none = 0
    for i, row in enumerate(ws.iter_rows(min_row=2, values_only=True)):
        vals = list(row)
        if len(vals) < 3:
            continue

        variant_name = vals[0]
        position = vals[1]
        lof_score = vals[2]

        if variant_name is None:
            continue
        if lof_score is None:
            skipped_none += 1
            continue

        variant_str = str(variant_name).strip()
        protein = f"p.{variant_str}"

        # Use Position column if available, otherwise parse from variant name
        if position is not None:
            aa_pos = int(position)
        else:
            aa_pos = parse_protein_pos(variant_str)

        if aa_pos is None:
            continue

        variants.append({
            'gene': 'MSH2',
            'dna': '',
            'protein': protein,
            'clinvar_id': '',
            'lof_score': str(lof_score),
            'name': protein,
            'paper': 'jia2021',
            'coord_type': 'protein',  # mapped via codon coordinates downstream
            'aa_pos': aa_pos,
        })

    wb.close()
    print(f"  Parsed {len(variants)} variants from Jia 2021 (skipped {skipped_none} with no LOF score)")
    return variants
+
+# ============================================================================
+# Data parsing: Paper 4 - Rath et al. 2022
+# ============================================================================
+
def parse_rath2022(filepath):
    """
    Rath et al. 2022 (PMID:36054288) - Supplementary Table S1.
    Source: NIHMS1834139-supplement-supinfo.pdf
    Gene: MLH1 (NM_000249.4 - MANE transcript)

    Data hardcoded from published supplement for reliability.
    OddsPath_Non-functional column. Variants with "-" (no OddsPath) are
    excluded. Scientific notation values (e.g. 1.08E-7) preserved as strings.
    "1,450" = 1450 (American thousands separator, not European decimal).

    ClinVar ID exceptions:
    - K618A (c.1852_1853delinsGC): Paper listed ClinVar 32128, which is a
      ClinVar measure_id (allele-level), not a variation_id (VCV). The
      /clinvar/variation/32128/ URL returns 404. Updated to VCV 17089, which
      is the current ClinVar variation record for this variant
      (NM_000249.4:c.1852_1853delinsGC, p.Lys618Ala).

    filepath: accepted for interface symmetry with the other parse_*
        functions, but unused — the data below is hardcoded.

    Returns list of variant dicts.
    """
    # (protein_short, dna, clinvar_id, odds_path_non_functional)
    # Protein short is the cell line name = protein change without p. prefix
    DATA = [
        # Benign (9 with OddsPath, 2 skipped: I219V, H718Y have "-")
        ('D132H', 'c.394G>C', '17096', '1.08E-7'),
        ('V213M', 'c.637G>A', '41640', '5.76E-4'),
        ('E268G', 'c.803A>G', '90382', '2.24E-4'),
        ('S406N', 'c.1217G>A', '41634', '1.49E-10'),
        ('E578G', 'c.1733A>G', '17090', '1.44E-6'),
        ('K618A', 'c.1852_1853delinsGC', '17089', '2.90E-13'), # Paper listed measure_id 32128 (404); updated to VCV 17089
        ('Q689R', 'c.2066A>G', '90016', '1.11E-4'),
        ('S692P', 'c.2074T>C', '127621', '4.02E-4'),
        ('V716M', 'c.2146G>A', '41639', '4.07E-5'),
        # VUS (6 with OddsPath, 15 skipped with "-")
        ('C39R', 'c.115T>C', '89655', '71.4'),
        ('D63N', 'c.187G>A', '89920', '0.00'),
        ('N64S', 'c.191A>G', '89947', '5.00E-2'),
        ('L73P', 'c.218T>C', '90086', '13.7'),
        ('G454R', 'c.1360G>C', '89705', '1.11E-2'),
        ('R474P', 'c.1421G>C', '', '0.489'),
        # Pathogenic (11 with OddsPath)
        ('I19F', 'c.55A>T', '90274', '346'),
        ('D41H', 'c.121G>C', '89682', '3.99E+4'),
        ('S44F', 'c.131C>T', '17079', '2.46E+4'),
        ('G67R', 'c.199G>A', '89992', '4.27E+4'),
        ('R100P', 'c.299G>C', '90133', '1450'),
        ('I107R', 'c.320T>G', '90167', '210'),
        ('T117M', 'c.350C>T', '17094', '1.33E+18'),
        ('S247P', 'c.739T>C', '90342', '105'),
        ('R265S', 'c.793C>A', '90380', '37.2'),
        ('I276R', 'c.827T>G', '520540', '319'),
        ('R687W', 'c.2059C>T', '90014', '1.74E+4'),
    ]

    print(f"  Loading Rath 2022 data (hardcoded from Supplementary Table S1)...")
    transcript = PAPER_TRANSCRIPTS['rath2022']['MLH1']

    variants = []
    for protein_short, dna, clinvar_id, odds_path in DATA:
        protein = f"p.{protein_short}"
        variants.append({
            'gene': 'MLH1',
            'dna': dna,
            'protein': protein,
            'clinvar_id': clinvar_id,
            'odds_path': odds_path,
            'name': f"{transcript}:{dna}",
            'paper': 'rath2022',
            'coord_type': 'cds',
        })

    print(f"  Loaded {len(variants)} variants from Rath 2022")
    return variants
+
+# ============================================================================
+# BED generation
+# ============================================================================
+
def process_variants(variants, db, transcript_cache, stats):
    """
    Convert parsed variants to BED entries for a given genome assembly.

    variants: list of dicts produced by the parse_* functions
    db: assembly name queried via hgsql (e.g. 'hg38')
    transcript_cache: dict shared across calls; caches get_transcript_info()
        results and derived CDS regions, keyed by "<db>_<accession>"
    stats: counter dict updated in place (included, parse_failed,
        coord_failed, classify_failed, plus per-paper included_* keys)

    Returns list of BED line strings.
    """
    bed_entries = []

    for var in variants:
        gene = var['gene']
        paper = var['paper']
        paper_info = PAPERS[paper]

        # Get transcript info (cached); mapping always uses the current
        # MANE accession from TRANSCRIPTS, not the paper-era version
        tx_accession = TRANSCRIPTS[gene]
        cache_key = f"{db}_{tx_accession}"
        if cache_key not in transcript_cache:
            transcript_cache[cache_key] = get_transcript_info(db, tx_accession)
            transcript_cache[f"{cache_key}_cds"] = build_cds_regions(transcript_cache[cache_key])

        tx = transcript_cache[cache_key]
        cds_regions = transcript_cache[f"{cache_key}_cds"]
        chrom = tx['chrom']
        strand = tx['strand']

        # Determine genomic coordinates based on coordinate type
        if var['coord_type'] == 'cds':
            cds_start, cds_end = parse_hgvsc_pos(var['dna'])
            if cds_start is None:
                stats['parse_failed'] += 1
                continue
            # NOTE(review): on a minus-strand transcript a multi-position
            # delins would yield start > end here; aa_to_genomic's docstring
            # says all three genes are on the + strand — confirm if genes
            # are ever added.
            genomic_start, _ = cds_pos_to_genomic(cds_start, cds_regions, strand)
            _, genomic_end = cds_pos_to_genomic(cds_end, cds_regions, strand)
        elif var['coord_type'] == 'protein':
            aa_pos = var.get('aa_pos') or parse_protein_pos(var['protein'])
            if aa_pos is None:
                stats['parse_failed'] += 1
                continue
            genomic_start, genomic_end = aa_to_genomic(aa_pos, cds_regions)
        else:
            stats['parse_failed'] += 1
            continue

        if genomic_start is None:
            stats['coord_failed'] += 1
            continue

        # Classify variant: Jia 2021 uses LOF scores, the others OddsPath
        if paper == 'jia2021':
            classification = classify_jia_lof(var['lof_score'])
            score_val = var['lof_score']
            score_label = 'LOF'
        else:
            classification = classify_tavtigian(var['odds_path'])
            score_val = var['odds_path']
            score_label = 'OddsPath'

        if classification is None:
            stats['classify_failed'] += 1
            continue

        color = COLORS[classification]
        name = var['name']
        protein = var['protein']
        clinvar_id = var.get('clinvar_id', '')
        paper_ref = paper_info['ref']

        # Build mouseover
        mouseover = build_mouseover(name, protein, classification,
                                    clinvar_id, score_val, score_label, paper)

        bed_line = make_bed_entry(chrom, genomic_start, genomic_end, name,
                                  color, gene, protein, classification,
                                  clinvar_id, score_val, paper_ref, mouseover)
        bed_entries.append(bed_line)
        stats['included'] += 1
        stats[f'included_{paper}'] = stats.get(f'included_{paper}', 0) + 1

    return bed_entries
+
def create_track(db):
    """Create BED and bigBed files for a given genome assembly.

    db: assembly name ('hg38' or 'hg19'); used for hgsql queries, the
        chrom.sizes path, and the output file names.

    Re-parses all four papers (parsing is cheap and keeps each assembly's
    run independent), converts variants to BED9+7 lines, sorts, and runs
    bedToBigBed. Returns (bed_file, bb_file) paths; the bigBed may not
    exist if bedToBigBed failed (a warning is printed instead of raising).
    """
    print(f"\n{'='*70}")
    print(f"Processing {db}")
    print(f"{'='*70}")

    stats = {
        'included': 0,
        'parse_failed': 0,
        'coord_failed': 0,
        'classify_failed': 0,
    }

    transcript_cache = {}

    # Parse all papers
    all_variants = []

    # Paper 1: Drost 2018
    print("\n  Paper 1: Drost et al. 2018")
    drost2018_file = os.path.join(OUTPUT_DIR, "drost2018_supplement.pdf")
    all_variants.extend(parse_drost2018(drost2018_file))

    # Paper 2: Drost 2020
    print("\n  Paper 2: Drost et al. 2020")
    drost2020_file = os.path.join(OUTPUT_DIR, "drost2020_supplement.docx")
    all_variants.extend(parse_drost2020(drost2020_file))

    # Paper 3: Jia 2021
    print("\n  Paper 3: Jia et al. 2021")
    jia2021_file = os.path.join(OUTPUT_DIR, "mmc2.xlsx")
    all_variants.extend(parse_jia2021(jia2021_file))

    # Paper 4: Rath 2022
    print("\n  Paper 4: Rath et al. 2022")
    rath2022_file = os.path.join(OUTPUT_DIR, "rath2022_supplement.pdf")
    all_variants.extend(parse_rath2022(rath2022_file))

    print(f"\n  Total variants to process: {len(all_variants)}")

    # Convert to BED entries
    print(f"\n  Querying transcript coordinates from {db}...")
    bed_entries = process_variants(all_variants, db, transcript_cache, stats)

    # Write BED file
    # NOTE(review): if bed_entries is empty this writes a lone newline and
    # bedToBigBed will likely fail on it — acceptable here since the
    # hardcoded papers always yield variants, but worth confirming.
    bed_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays_{db}.bed")
    print(f"\n  Writing BED file: {bed_file}")
    with open(bed_file, 'w') as f:
        f.write('\n'.join(bed_entries) + '\n')

    # Sort BED file (bedToBigBed requires chrom/chromStart order)
    print("  Sorting BED file...")
    bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}")

    # Create bigBed
    as_file = os.path.join(OUTPUT_DIR, "insightFunctionalAssays.as")
    db_label = db.replace('hg', 'Hg')
    bb_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays{db_label}.bb")
    chrom_sizes = f"/cluster/data/{db}/chrom.sizes"

    print(f"  Creating bigBed file: {bb_file}")
    try:
        bash(f"bedToBigBed -as={as_file} -type=bed9+7 -tab {bed_file} {chrom_sizes} {bb_file}")
        print(f"  Successfully created: {bb_file}")
    except Exception as e:
        # Best-effort: keep the sorted BED even when bigBed conversion fails
        print(f"  ERROR creating bigBed: {e}")

    # Print stats
    print(f"\n  Statistics for {db}:")
    print(f"    Total included: {stats['included']}")
    for paper_key in ['drost2018', 'drost2020', 'jia2021', 'rath2022']:
        count = stats.get(f'included_{paper_key}', 0)
        if count > 0:
            print(f"      {PAPERS[paper_key]['ref']}: {count}")
    print(f"    Parse failed: {stats['parse_failed']}")
    print(f"    Coordinate failed: {stats['coord_failed']}")
    print(f"    Classify failed: {stats['classify_failed']}")

    return bed_file, bb_file
+
+# ============================================================================
+# Main
+# ============================================================================
+
def main():
    """Entry point: write the autoSql schema, then build hg38 and hg19 tracks.

    A failure on one assembly is reported (with traceback) but does not
    stop the other assembly from being processed.
    """
    print("=" * 70)
    print("InSiGHT VCEP Functional Assays Track Generator")
    print("=" * 70)

    # Create output directory if needed
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Write AutoSql file (shared by both assemblies' bedToBigBed runs)
    as_file = os.path.join(OUTPUT_DIR, "insightFunctionalAssays.as")
    print(f"\nWriting AutoSql file: {as_file}")
    with open(as_file, 'w') as f:
        f.write(AUTOSQL)

    # Process both assemblies
    output_files = {}
    for db in ['hg38', 'hg19']:
        try:
            bed_file, bb_file = create_track(db)
            output_files[db] = {'bed': bed_file, 'bb': bb_file}
        except Exception as e:
            # Keep going so one bad assembly doesn't block the other
            print(f"\nERROR processing {db}: {e}")
            import traceback
            traceback.print_exc()

    # Print summary
    print("\n" + "=" * 70)
    print("Output Files")
    print("=" * 70)
    print(f"  AutoSql file: {as_file}")
    for db, files in output_files.items():
        print(f"\n  {db}:")
        print(f"    BED file: {files['bed']}")
        # bigBed creation may have failed; only report it when present
        if os.path.exists(files['bb']):
            print(f"    BigBed file: {files['bb']}")

    print("\n" + "=" * 70)
    print("Done!")
    print("=" * 70)

if __name__ == "__main__":
    main()