src/hg/makeDb/scripts/insight/insightFunctionalAssays.py ed3cb243b9151cae7d1859712148fa1d1e1a2196

ed3cb243b9151cae7d1859712148fa1d1e1a2196
lrnassar
  Mon Apr 13 17:34:17 2026 -0700
Adding makedoc and build scripts for InSiGHT VCEP track hub. refs #36582

diff --git src/hg/makeDb/scripts/insight/insightFunctionalAssays.py src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
new file mode 100644
index 00000000000..32ceaa0699e
--- /dev/null
+++ src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
@@ -0,0 +1,963 @@
+#!/usr/bin/env python3
+"""
+InSiGHT VCEP Functional Assays Track Generator
+
+Generates UCSC Genome Browser tracks (bigBed format) for functional assay evidence
+from published studies on Lynch syndrome MMR genes. Combines data from:
+
+  Paper 1: Drost et al. 2018 (PMID:30504929) - MLH1/MSH2 CIMRA assay
+  Paper 2: Drost et al. 2020 (PMID:31965077) - MSH6 CIMRA assay
+  Paper 3: Jia et al. 2021 (PMID:33357406)   - MSH2 deep mutational scanning
+  Paper 4: Rath et al. 2022 (PMID:36054288)  - MLH1 cell-based functional assay
+
+Each variant is classified according to ACMG PS3/BS3 criteria based on
+Tavtigian OddsPath thresholds or Jia LOF score thresholds.
+
+Output: BED9+7 bigBed files for hg38 and hg19.
+
+Supplementary data files (place in same directory as this script):
+  drost2018_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/7901556/bin/NIHMS1010100-supplement-CIMRA.pdf
+  drost2020_supplement.docx - https://static-content.springer.com/esm/art%3A10.1038%2Fs41436-019-0736-2/MediaObjects/41436_2019_736_MOESM2_ESM.docx
+  mmc2.xlsx - https://pmc.ncbi.nlm.nih.gov/articles/instance/7820803/bin/mmc2.xlsx
+  rath2022_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/9772141/bin/NIHMS1834139-supplement-supinfo.pdf
+
+Author: Generated for InSiGHT VCEP
+Date: 2025
+"""
+
+import subprocess
+import os
+import re
+import sys
+import zipfile
+import xml.etree.ElementTree as ET
+
+try:
+    import openpyxl
+except ImportError:
+    print("ERROR: openpyxl is required. Install with: pip install openpyxl")
+    sys.exit(1)
+
+# ============================================================================
+# Configuration
+# ============================================================================
+OUTPUT_DIR = "/hive/users/lrnassar/insightHub/functionalAssays"
+
+# Transcripts for coordinate mapping (current MANE versions in hgsql)
+TRANSCRIPTS = {
+    'MLH1': 'NM_000249.4',
+    'MSH2': 'NM_000251.3',
+    'MSH6': 'NM_000179.3',
+}
+
+# Paper-specific transcript versions (used in the name field per paper's notation)
+PAPER_TRANSCRIPTS = {
+    'drost2018': {'MLH1': 'NM_000249.3', 'MSH2': 'NM_000251.2'},
+    'drost2020': {'MSH6': 'NM_000179.2'},
+    'rath2022':  {'MLH1': 'NM_000249.4'},
+}
+
+# Classification colors (RGB) from insight.html
+COLORS = {
+    'PS3_Strong':     '180,0,0',       # Dark Red
+    'PS3_Moderate':   '210,0,0',       # Red
+    'PS3_Supporting': '245,152,152',   # Pink
+    'Indeterminate':  '128,128,128',   # Gray
+    'BS3_Supporting': '213,247,213',   # Lime Green
+    'BS3_Strong':     '0,210,0',       # Green
+}
+
+# Paper references and PMIDs
+PAPERS = {
+    'drost2018': {'ref': 'Drost et al., 2018', 'pmid': '30504929'},
+    'drost2020': {'ref': 'Drost et al., 2020', 'pmid': '31965077'},
+    'jia2021':   {'ref': 'Jia et al., 2021',   'pmid': '33357406'},
+    'rath2022':  {'ref': 'Rath et al., 2022',  'pmid': '36054288'},
+}
+
+# AutoSQL definition
+AUTOSQL = """table insightFunctionalAssays
+"Functional assay evidence for Lynch syndrome MMR gene variant classification"
+   (
+   string chrom;            "Reference sequence chromosome or scaffold"
+   uint   chromStart;       "Start position in chromosome"
+   uint   chromEnd;         "End position in chromosome"
+   string name;             "Variant name (HGVSc or HGVSp notation)"
+   uint   score;            "Not used, set to 0"
+   char[1] strand;          "Not used, set to ."
+   uint   thickStart;       "Same as chromStart"
+   uint   thickEnd;         "Same as chromEnd"
+   uint   reserved;         "RGB color value"
+   string gene;             "Gene symbol"
+   string protein;          "Protein change (HGVSp notation)"
+   string classification;   "ACMG evidence classification (PS3/BS3)"
+   string clinVarId;        "ClinVar variation ID, if available"
+   string score_value;      "Functional score (OddsPath or LOF)"
+   string paperRef;         "Publication reference"
+   lstring _mouseOver;      "HTML mouseover text"
+   )
+"""
+
+# ============================================================================
+# Utility functions
+# ============================================================================
+
+def bash(cmd):
+    """Run the cmd in bash subprocess"""
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+        bashStdoutt = rawBashOutput.stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+    return(bashStdoutt)
+
+def get_transcript_info(db, accession):
+    """Query hgsql to get transcript information from ncbiRefSeq"""
+    query = f"SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{accession}'"
+    result = bash(f'hgsql {db} -Ne "{query}"')
+
+    if not result.strip():
+        raise ValueError(f"Transcript {accession} not found in {db}.ncbiRefSeq")
+
+    fields = result.strip().split('\t')
+
+    # Parse exon starts and ends (comma-separated, trailing comma)
+    exon_starts = [int(x) for x in fields[7].rstrip(',').split(',')]
+    exon_ends = [int(x) for x in fields[8].rstrip(',').split(',')]
+
+    return {
+        'name': fields[0],
+        'chrom': fields[1],
+        'strand': fields[2],
+        'txStart': int(fields[3]),
+        'txEnd': int(fields[4]),
+        'cdsStart': int(fields[5]),
+        'cdsEnd': int(fields[6]),
+        'exonStarts': exon_starts,
+        'exonEnds': exon_ends,
+    }
+
+def build_cds_regions(tx):
+    """Build list of CDS regions from transcript info"""
+    cds_regions = []
+    for i in range(len(tx['exonStarts'])):
+        ex_start = tx['exonStarts'][i]
+        ex_end = tx['exonEnds'][i]
+        # Skip exons outside CDS
+        if ex_end <= tx['cdsStart'] or ex_start >= tx['cdsEnd']:
+            continue
+        # Clip exon to CDS boundaries
+        region_start = max(ex_start, tx['cdsStart'])
+        region_end = min(ex_end, tx['cdsEnd'])
+        cds_regions.append((region_start, region_end))
+
+    # For minus strand, reverse order (CDS starts from high genomic coords)
+    if tx['strand'] == '-':
+        cds_regions = cds_regions[::-1]
+
+    return cds_regions
+
+def cds_pos_to_genomic(cds_pos, cds_regions, strand):
+    """
+    Convert a 1-based CDS position to 0-based genomic coordinate.
+    Returns (genomic_start, genomic_end) for the single nucleotide.
+    """
+    cumulative = 0
+
+    for region_start, region_end in cds_regions:
+        region_len = region_end - region_start
+        region_cds_start = cumulative + 1
+        region_cds_end = cumulative + region_len
+
+        if cds_pos >= region_cds_start and cds_pos <= region_cds_end:
+            offset = cds_pos - region_cds_start
+
+            if strand == '+':
+                genomic_pos = region_start + offset
+            else:
+                # Minus strand: count from end of region
+                genomic_pos = region_end - offset - 1
+
+            return genomic_pos, genomic_pos + 1
+
+        cumulative += region_len
+
+    return None, None
+
+def aa_to_genomic(aa_pos, cds_regions):
+    """
+    Convert a 1-based amino acid position to 0-based genomic coordinates
+    for the 3-nucleotide codon span. Only handles + strand genes
+    (MLH1, MSH2, MSH6 are all on + strand).
+    Returns (genomic_start, genomic_end) or (None, None).
+    """
+    nt_start = (aa_pos - 1) * 3 + 1  # 1-based CDS nucleotide position
+    nt_end = aa_pos * 3
+
+    # Find the genomic positions for the first and last nucleotide of the codon
+    first_start, first_end = cds_pos_to_genomic(nt_start, cds_regions, '+')
+    last_start, last_end = cds_pos_to_genomic(nt_end, cds_regions, '+')
+
+    if first_start is not None and last_start is not None:
+        return first_start, last_end
+
+    return None, None
+
+def parse_hgvsc_pos(hgvsc):
+    """
+    Parse HGVSc notation to extract the CDS position and span.
+    Returns (start_pos, end_pos) as 1-based CDS positions.
+    For simple substitutions, start == end.
+
+    Examples:
+    - c.4T>A -> (4, 4)
+    - c.100G>C -> (100, 100)
+    - c.104_105delinsAC -> (104, 105)
+    - c.1852_1853delinsGC -> (1852, 1853)
+    """
+    hgvsc = hgvsc.strip()
+    # Simple substitution: c.100G>C
+    match = re.match(r'c\.(\d+)[ACGT]>[ACGT]', hgvsc)
+    if match:
+        pos = int(match.group(1))
+        return pos, pos
+    # Delins spanning positions: c.104_105delinsAC
+    match = re.match(r'c\.(\d+)_(\d+)delins', hgvsc)
+    if match:
+        return int(match.group(1)), int(match.group(2))
+    return None, None
+
+def parse_protein_pos(prot):
+    """
+    Parse amino acid position from protein notation.
+    Handles both p.X###Y and X###Y formats.
+
+    Examples:
+    - p.W372R -> 372
+    - p.K13T -> 13
+    - M1A -> 1
+    """
+    prot = prot.strip()
+    if prot.startswith('p.'):
+        prot = prot[2:]
+    match = re.match(r'[A-Z*](\d+)', prot)
+    if match:
+        return int(match.group(1))
+    return None
+
+# ============================================================================
+# Classification functions
+# ============================================================================
+
+def classify_tavtigian(odds_path):
+    """
+    Classify variant using Tavtigian OddsPath thresholds.
+    Used for Papers 1, 2, 4 (CIMRA assays).
+
+    Thresholds:
+      PS3_Strong:     > 18.7
+      PS3_Moderate:   > 4.3 and <= 18.7
+      PS3_Supporting: > 2.08 and <= 4.3
+      Indeterminate:  > 0.48 and <= 2.08
+      BS3_Supporting: > 0.05 and <= 0.48
+      BS3_Strong:     <= 0.05
+    """
+    try:
+        val = float(odds_path)
+    except (ValueError, TypeError):
+        return None
+
+    if val > 18.7:
+        return 'PS3_Strong'
+    elif val > 4.3:
+        return 'PS3_Moderate'
+    elif val > 2.08:
+        return 'PS3_Supporting'
+    elif val > 0.48:
+        return 'Indeterminate'
+    elif val > 0.05:
+        return 'BS3_Supporting'
+    else:
+        return 'BS3_Strong'
+
+def classify_jia_lof(lof_score):
+    """
+    Classify variant using Jia LOF score thresholds.
+    Used for Paper 3 (deep mutational scanning).
+
+    Thresholds:
+      PS3_Strong:    >= 0.4
+      Indeterminate: >= 0 and < 0.4
+      BS3_Strong:    < 0
+    """
+    try:
+        val = float(lof_score)
+    except (ValueError, TypeError):
+        return None
+
+    if val >= 0.4:
+        return 'PS3_Strong'
+    elif val >= 0:
+        return 'Indeterminate'
+    else:
+        return 'BS3_Strong'
+
+# ============================================================================
+# MouseOver builder
+# ============================================================================
+
+def build_mouseover(name, protein, classification, clinvar_id, score_val, score_label, paper_key):
+    """Build HTML mouseover text for a variant"""
+    paper_info = PAPERS[paper_key]
+    parts = []
+    parts.append(f"<b>HGVSc/HGVSp:</b> {name}")
+    parts.append(f"<b>Protein:</b> {protein}")
+    parts.append(f"<b>Classification:</b> {classification}")
+
+    if clinvar_id and clinvar_id != 'NA' and clinvar_id != '':
+        parts.append(f'<b>ClinVar ID:</b> <a href="https://www.ncbi.nlm.nih.gov/clinvar/variation/{clinvar_id}/" target="_blank">{clinvar_id}</a>')
+    else:
+        parts.append("<b>ClinVar ID:</b> N/A")
+
+    parts.append(f"<b>{score_label}:</b> {score_val}")
+    parts.append(f'<b>Paper:</b> <a href="https://pubmed.ncbi.nlm.nih.gov/{paper_info["pmid"]}/" target="_blank">{paper_info["ref"]}</a>')
+
+    return "</br>".join(parts)
+
+# ============================================================================
+# BED entry builder
+# ============================================================================
+
+def make_bed_entry(chrom, start, end, name, color, gene, protein,
+                   classification, clinvar_id, score_val, paper_ref, mouseover):
+    """Create a BED9+7 line"""
+    return (f"{chrom}\t{start}\t{end}\t{name}\t0\t.\t{start}\t{end}\t{color}"
+            f"\t{gene}\t{protein}\t{classification}\t{clinvar_id}"
+            f"\t{score_val}\t{paper_ref}\t{mouseover}")
+
+# ============================================================================
+# Data parsing: Paper 1 - Drost et al. 2018
+# ============================================================================
+
+def parse_drost2018(filepath):
+    """
+    Drost et al. 2018 (PMID:30504929) - Supplementary Table S2.
+    Source: NIHMS1010100-supplement-CIMRA.pdf
+    Genes: MLH1 (NM_000249.3), MSH2 (NM_000251.2)
+
+    Data hardcoded from published supplement for reliability.
+    OddsPath values are the CIMRA-based "Odds for Pathogenicity" column
+    (Classification Prior-P + CIMRA section). European decimal commas
+    converted to periods.
+
+    ClinVar ID exceptions:
+      - V470G (c.1409T>G): Paper listed ClinVar 90658, but that ID now resolves
+        to V470E (c.1409T>A) — a different nucleotide change at the same position.
+        ClinVar appears to have reassigned the ID since publication. Removed to
+        avoid linking to the wrong variant.
+
+    Returns list of variant dicts.
+    """
+    # (gene, dna, protein, clinvar_id, odds_path_cimra)
+    DATA = [
+        # MLH1 variants (39)
+        ('MLH1', 'c.62C>A',              'p.A21E',  '90297',  '23.621'),
+        ('MLH1', 'c.62C>T',              'p.A21V',  '90298',  '29.238'),
+        ('MLH1', 'c.73A>T',              'p.I25F',  '90343',  '23.919'),
+        ('MLH1', 'c.104_105delinsAC',    'p.M35N',  '17105',  '20.835'),
+        ('MLH1', 'c.143A>C',             'p.Q48P',  '89739',  '31.923'),
+        ('MLH1', 'c.146T>A',             'p.V49E',  '89749',  '7.829'),
+        ('MLH1', 'c.191A>G',             'p.N64S',  '89947',  '0.483'),
+        ('MLH1', 'c.203T>A',             'p.I68N',  '90008',  '29.607'),
+        ('MLH1', 'c.244A>G',             'p.T82A',  '90116',  '47.103'),
+        ('MLH1', 'c.245C>T',             'p.T82I',  '90118',  '33.147'),
+        ('MLH1', 'c.301G>A',             'p.G101S', '90137',  '44.238'),
+        ('MLH1', 'c.332C>T',             'p.A111V', '90171',  '21.908'),
+        ('MLH1', 'c.346A>C',             'p.T116P', '',       '17.699'),
+        ('MLH1', 'c.382G>C',             'p.A128P', '90199',  '32.734'),
+        ('MLH1', 'c.394G>C',             'p.D132H', '17096',  '0.002'),
+        ('MLH1', 'c.779T>G',             'p.L260R', '90350',  '18.378'),
+        ('MLH1', 'c.793C>A',             'p.R265S', '90380',  '10.580'),
+        ('MLH1', 'c.803A>G',             'p.E268G', '90382',  '0.070'),
+        ('MLH1', 'c.911A>T',             'p.D304V', '90439',  '33.990'),
+        ('MLH1', 'c.918T>A',             'p.N306K', '90441',  '51.428'),
+        ('MLH1', 'c.1321G>A',            'p.A441T', '89696',  '0.001'),
+        ('MLH1', 'c.1664T>C',            'p.L555P', '89822',  '57.577'),
+        ('MLH1', 'c.1733A>G',            'p.E578G', '17090',  '0.003'),
+        ('MLH1', 'c.1742C>T',            'p.P581L', '41635',  '0.002'),
+        ('MLH1', 'c.1745T>C',            'p.L582P', '89871',  '28.874'),
+        ('MLH1', 'c.1799A>G',            'p.E600G', '89890',  '0.002'),
+        ('MLH1', 'c.1808C>G',            'p.P603R', '41636',  '0.002'),
+        ('MLH1', 'c.1823C>A',            'p.A608D', '89897',  '2.905'),
+        ('MLH1', 'c.1853A>C',            'p.K618T', '89906',  '3.377'),
+        ('MLH1', 'c.1855G>C',            'p.A619P', '89909',  '2.833'),
+        ('MLH1', 'c.1865T>A',            'p.L622H', '29657',  '6.486'),
+        ('MLH1', 'c.1943C>T',            'p.P648L', '89953',  '0.521'),
+        ('MLH1', 'c.1942C>T',            'p.P648S', '17097',  '3.507'),
+        ('MLH1', 'c.1976G>T',            'p.R659L', '89966',  '4.507'),
+        ('MLH1', 'c.2038T>C',            'p.C680R', '90006',  '37.579'),
+        ('MLH1', 'c.2041G>A',            'p.A681T', '17099',  '0.031'),
+        ('MLH1', 'c.2101C>A',            'p.Q701K', '90042',  '0.034'),
+        ('MLH1', 'c.2128A>G',            'p.N710D', '',       '0.160'),
+        ('MLH1', 'c.2263A>G',            'p.R755G', '219292', '68.636'),
+        # MSH2 variants (35)
+        ('MSH2', 'c.23C>T',              'p.T8M',   '90964',  '0.166'),
+        ('MSH2', 'c.277C>T',             'p.L93F',  '91044',  '0.554'),
+        ('MSH2', 'c.287G>A',             'p.R96H',  '91053',  '0.051'),
+        ('MSH2', 'c.317G>A',             'p.R106K', '91062',  '0.013'),
+        ('MSH2', 'c.488T>A',             'p.V163D', '91107',  '89.331'),
+        ('MSH2', 'c.488T>G',             'p.V163G', '91108',  '79.790'),
+        ('MSH2', 'c.493T>G',             'p.Y165D', '91111',  '81.818'),
+        ('MSH2', 'c.505A>G',             'p.I169V', '91115',  '0.000'),
+        ('MSH2', 'c.560T>G',             'p.L187R', '91135',  '59.040'),
+        ('MSH2', 'c.595T>C',             'p.C199R', '91146',  '105.160'),
+        ('MSH2', 'c.599T>A',             'p.V200D', '91148',  '89.331'),
+        ('MSH2', 'c.929T>C',             'p.L310P', '91245',  '55.450'),
+        ('MSH2', 'c.989T>C',             'p.L330P', '91270',  '81.818'),
+        ('MSH2', 'c.991A>G',             'p.N331D', '91271',  '0.125'),
+        ('MSH2', 'c.1022T>C',            'p.L341P', '90507',  '42.073'),
+        ('MSH2', 'c.1046C>G',            'p.P349R', '90513',  '80.798'),
+        ('MSH2', 'c.1077A>T',            'p.R359S', '90539',  '62.080'),
+        ('MSH2', 'c.1168C>T',            'p.L390F', '41641',  '0.003'),
+        ('MSH2', 'c.1255C>A',            'p.Q419K', '90583',  '0.002'),
+        ('MSH2', 'c.1319T>C',            'p.L440P', '90625',  '33.990'),
+        ('MSH2', 'c.1409T>G',            'p.V470G', '',       '83.898'),  # Paper listed ClinVar 90658, but that ID now points to V470E (c.1409T>A); removed
+        ('MSH2', 'c.1571G>T',            'p.R524L', '90701',  '0.179'),
+        ('MSH2', 'c.1690A>G',            'p.T564A', '90750',  '0.001'),
+        ('MSH2', 'c.1730T>C',            'p.I577T', '41644',  '0.323'),
+        ('MSH2', 'c.1808A>G',            'p.D603G', '90791',  '66.934'),
+        ('MSH2', 'c.1955C>A',            'p.P652H', '90823',  '0.035'),
+        ('MSH2', 'c.1963G>A',            'p.V655I', '127637', '0.002'),
+        ('MSH2', 'c.2021G>A',            'p.G674D', '90861',  '50.787'),
+        ('MSH2', 'c.2023A>G',            'p.K675E', '408453', '54.075'),
+        ('MSH2', 'c.2047G>A',            'p.G683R', '90868',  '74.003'),
+        ('MSH2', 'c.2063T>G',            'p.M688R', '90874',  '77.813'),
+        ('MSH2', 'c.2087C>T',            'p.P696L', '90881',  '18.845'),
+        ('MSH2', 'c.2276G>A',            'p.G759E', '41866',  '2.661'),
+        ('MSH2', 'c.2287G>C',            'p.A763P', '',       '35.294'),
+        ('MSH2', 'c.2500G>A',            'p.A834T', '90986',  '0.004'),
+    ]
+
+    print(f"  Loading Drost 2018 data (hardcoded from Supplementary Table S2)...")
+    variants = []
+    for gene, dna, protein, clinvar_id, odds_path in DATA:
+        transcript = PAPER_TRANSCRIPTS['drost2018'][gene]
+        variants.append({
+            'gene': gene,
+            'dna': dna,
+            'protein': protein,
+            'clinvar_id': clinvar_id,
+            'odds_path': odds_path,
+            'name': f"{transcript}:{dna}",
+            'paper': 'drost2018',
+            'coord_type': 'cds',
+        })
+
+    print(f"    Loaded {len(variants)} variants from Drost 2018")
+    return variants
+
+# ============================================================================
+# Data parsing: Paper 2 - Drost et al. 2020
+# ============================================================================
+
+def get_docx_tables(docx_path):
+    """Extract tables from a DOCX file using XML parsing"""
+    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
+
+    with zipfile.ZipFile(docx_path, 'r') as z:
+        xml_content = z.read('word/document.xml')
+
+    root = ET.fromstring(xml_content)
+    tables = root.findall('.//w:tbl', ns)
+
+    result = []
+    for table in tables:
+        rows = table.findall('.//w:tr', ns)
+        table_data = []
+        for row in rows:
+            cells = row.findall('.//w:tc', ns)
+            row_data = []
+            for cell in cells:
+                texts = []
+                for p in cell.findall('.//w:t', ns):
+                    if p.text:
+                        texts.append(p.text)
+                row_data.append(''.join(texts))
+            table_data.append(row_data)
+        result.append(table_data)
+
+    return result
+
+def parse_drost2020(filepath):
+    """
+    Parse Drost 2020 supplementary DOCX (Tables S1, S3, S5).
+    Gene: MSH6 (NM_000179.2 for S1/S3, protein-only for S5)
+    Returns list of variant dicts.
+
+    Table structure (from DOCX XML):
+      Table 0 = S1 (31 calibration variants): DNA col 0, Protein col 1, ClinVar col 2, OddsPath col 10
+      Table 2 = S3 (18 additional variants):  DNA col 0, Protein col 1, ClinVar col 2, OddsPath col 8
+      Table 4 = S5 (38 screen variants):      Protein col 0, OddsPath col 4
+
+    Errata corrections:
+      - p.V509Ag (c.1526T>C) in S1: Trailing 'g' is a typo in the published
+        DOCX. ClinVar 41588 confirms the correct notation is p.Val509Ala
+        (p.V509A). Corrected during parsing.
+    """
+    if not os.path.exists(filepath):
+        print(f"  WARNING: {filepath} not found, skipping Drost 2020")
+        return []
+
+    print(f"  Parsing {filepath}...")
+    tables = get_docx_tables(filepath)
+    transcript = PAPER_TRANSCRIPTS['drost2020']['MSH6']
+
+    variants = []
+
+    # Table 0 = S1: 31 calibration variants (rows 2-32, skipping header rows 0-1)
+    if len(tables) > 0:
+        table_s1 = tables[0]
+        for row in table_s1[2:]:  # Skip 2 header rows
+            if len(row) < 11:
+                continue
+            dna = row[0].strip()
+            protein = row[1].strip()
+            clinvar_id = row[2].strip()
+            odds_path = row[10].strip()
+
+            if not dna.startswith('c.'):
+                continue
+            if clinvar_id == 'NA':
+                clinvar_id = ''
+            # Fix known typos in published DOCX: trailing 'g' on protein names
+            # is a formatting artifact (e.g., p.V509Ag, p.E220Dg, p.E1163Vg, p.R1304Kg)
+            # ClinVar confirms correct forms without trailing 'g'
+            protein = re.sub(r'^(p\.[A-Z]\d+[A-Z])g$', r'\1', protein)
+
+            variants.append({
+                'gene': 'MSH6',
+                'dna': dna,
+                'protein': protein,
+                'clinvar_id': clinvar_id,
+                'odds_path': odds_path,
+                'name': f"{transcript}:{dna}",
+                'paper': 'drost2020',
+                'coord_type': 'cds',
+            })
+        print(f"    S1: {len([v for v in variants if v['coord_type'] == 'cds'])} variants")
+
+    # Table 2 = S3: 18 additional variants (rows 2-19, skipping header rows 0-1)
+    s3_count = 0
+    if len(tables) > 2:
+        table_s3 = tables[2]
+        for row in table_s3[2:]:  # Skip 2 header rows
+            if len(row) < 9:
+                continue
+            dna = row[0].strip()
+            protein = row[1].strip()
+            clinvar_id = row[2].strip()
+            odds_path = row[8].strip()
+
+            if not dna.startswith('c.'):
+                continue
+            if clinvar_id == 'NA':
+                clinvar_id = ''
+
+            variants.append({
+                'gene': 'MSH6',
+                'dna': dna,
+                'protein': protein,
+                'clinvar_id': clinvar_id,
+                'odds_path': odds_path,
+                'name': f"{transcript}:{dna}",
+                'paper': 'drost2020',
+                'coord_type': 'cds',
+            })
+            s3_count += 1
+        print(f"    S3: {s3_count} variants")
+
+    # Table 4 = S5: 38 genetic screen variants (rows 2-39, skipping header rows 0-1)
+    s5_count = 0
+    if len(tables) > 4:
+        table_s5 = tables[4]
+        for row in table_s5[2:]:  # Skip 2 header rows
+            if len(row) < 5:
+                continue
+            protein = row[0].strip()
+            odds_path = row[4].strip()
+
+            if not protein.startswith('p.'):
+                continue
+
+            variants.append({
+                'gene': 'MSH6',
+                'dna': '',
+                'protein': protein,
+                'clinvar_id': '',
+                'odds_path': odds_path,
+                'name': protein,
+                'paper': 'drost2020',
+                'coord_type': 'protein',
+            })
+            s5_count += 1
+        print(f"    S5: {s5_count} variants")
+
+    print(f"    Total: {len(variants)} variants from Drost 2020")
+    return variants
+
+# ============================================================================
+# Data parsing: Paper 3 - Jia et al. 2021
+# ============================================================================
+
+def parse_jia2021(filepath):
+    """
+    Parse Jia 2021 supplementary XLSX (TableS5_AllVariants).
+    Gene: MSH2 (NM_000251, mapped using NM_000251.3)
+    Returns list of variant dicts.
+
+    TableS5 columns: Variant (col 0), Position (col 1), LOF score (col 2)
+    Variants in compact notation like M1A, need p. prefix.
+    Skip variants with LOF score = None.
+    """
+    if not os.path.exists(filepath):
+        print(f"  WARNING: {filepath} not found, skipping Jia 2021")
+        return []
+
+    print(f"  Parsing {filepath}...")
+    wb = openpyxl.load_workbook(filepath, read_only=True)
+
+    variants = []
+    ws = wb['TableS5_AllVariants']
+
+    skipped_none = 0
+    for i, row in enumerate(ws.iter_rows(min_row=2, values_only=True)):
+        vals = list(row)
+        if len(vals) < 3:
+            continue
+
+        variant_name = vals[0]
+        position = vals[1]
+        lof_score = vals[2]
+
+        if variant_name is None:
+            continue
+        if lof_score is None:
+            skipped_none += 1
+            continue
+
+        variant_str = str(variant_name).strip()
+        protein = f"p.{variant_str}"
+
+        # Use Position column if available, otherwise parse from variant name
+        if position is not None:
+            aa_pos = int(position)
+        else:
+            aa_pos = parse_protein_pos(variant_str)
+
+        if aa_pos is None:
+            continue
+
+        variants.append({
+            'gene': 'MSH2',
+            'dna': '',
+            'protein': protein,
+            'clinvar_id': '',
+            'lof_score': str(lof_score),
+            'name': protein,
+            'paper': 'jia2021',
+            'coord_type': 'protein',
+            'aa_pos': aa_pos,
+        })
+
+    wb.close()
+    print(f"    Parsed {len(variants)} variants from Jia 2021 (skipped {skipped_none} with no LOF score)")
+    return variants
+
+# ============================================================================
+# Data parsing: Paper 4 - Rath et al. 2022
+# ============================================================================
+
+def parse_rath2022(filepath):
+    """
+    Rath et al. 2022 (PMID:36054288) - Supplementary Table S1.
+    Source: NIHMS1834139-supplement-supinfo.pdf
+    Gene: MLH1 (NM_000249.4 - MANE transcript)
+
+    Data hardcoded from published supplement for reliability.
+    OddsPath_Non-functional column. Variants with "-" (no OddsPath) are
+    excluded. Scientific notation values (e.g. 1.08E-7) preserved as strings.
+    "1,450" = 1450 (American thousands separator, not European decimal).
+
+    ClinVar ID exceptions:
+      - K618A (c.1852_1853delinsGC): Paper listed ClinVar 32128, which is a
+        ClinVar measure_id (allele-level), not a variation_id (VCV). The
+        /clinvar/variation/32128/ URL returns 404. Updated to VCV 17089, which
+        is the current ClinVar variation record for this variant
+        (NM_000249.4:c.1852_1853delinsGC, p.Lys618Ala).
+
+    Returns list of variant dicts.
+    """
+    # (protein_short, dna, clinvar_id, odds_path_non_functional)
+    # Protein short is the cell line name = protein change without p. prefix
+    DATA = [
+        # Benign (9 with OddsPath, 2 skipped: I219V, H718Y have "-")
+        ('D132H', 'c.394G>C',             '17096',  '1.08E-7'),
+        ('V213M', 'c.637G>A',             '41640',  '5.76E-4'),
+        ('E268G', 'c.803A>G',             '90382',  '2.24E-4'),
+        ('S406N', 'c.1217G>A',            '41634',  '1.49E-10'),
+        ('E578G', 'c.1733A>G',            '17090',  '1.44E-6'),
+        ('K618A', 'c.1852_1853delinsGC',  '17089',  '2.90E-13'),  # Paper listed measure_id 32128 (404); updated to VCV 17089
+        ('Q689R', 'c.2066A>G',            '90016',  '1.11E-4'),
+        ('S692P', 'c.2074T>C',            '127621', '4.02E-4'),
+        ('V716M', 'c.2146G>A',            '41639',  '4.07E-5'),
+        # VUS (6 with OddsPath, 15 skipped with "-")
+        ('C39R',  'c.115T>C',             '89655',  '71.4'),
+        ('D63N',  'c.187G>A',             '89920',  '0.00'),
+        ('N64S',  'c.191A>G',             '89947',  '5.00E-2'),
+        ('L73P',  'c.218T>C',             '90086',  '13.7'),
+        ('G454R', 'c.1360G>C',            '89705',  '1.11E-2'),
+        ('R474P', 'c.1421G>C',            '',       '0.489'),
+        # Pathogenic (11 with OddsPath)
+        ('I19F',  'c.55A>T',              '90274',  '346'),
+        ('D41H',  'c.121G>C',             '89682',  '3.99E+4'),
+        ('S44F',  'c.131C>T',             '17079',  '2.46E+4'),
+        ('G67R',  'c.199G>A',             '89992',  '4.27E+4'),
+        ('R100P', 'c.299G>C',             '90133',  '1450'),
+        ('I107R', 'c.320T>G',             '90167',  '210'),
+        ('T117M', 'c.350C>T',             '17094',  '1.33E+18'),
+        ('S247P', 'c.739T>C',             '90342',  '105'),
+        ('R265S', 'c.793C>A',             '90380',  '37.2'),
+        ('I276R', 'c.827T>G',             '520540', '319'),
+        ('R687W', 'c.2059C>T',            '90014',  '1.74E+4'),
+    ]
+
+    print(f"  Loading Rath 2022 data (hardcoded from Supplementary Table S1)...")
+    transcript = PAPER_TRANSCRIPTS['rath2022']['MLH1']
+
+    variants = []
+    for protein_short, dna, clinvar_id, odds_path in DATA:
+        protein = f"p.{protein_short}"
+        variants.append({
+            'gene': 'MLH1',
+            'dna': dna,
+            'protein': protein,
+            'clinvar_id': clinvar_id,
+            'odds_path': odds_path,
+            'name': f"{transcript}:{dna}",
+            'paper': 'rath2022',
+            'coord_type': 'cds',
+        })
+
+    print(f"    Loaded {len(variants)} variants from Rath 2022")
+    return variants
+
+# ============================================================================
+# BED generation
+# ============================================================================
+
+def process_variants(variants, db, transcript_cache, stats):
+    """
+    Convert parsed variants to BED entries for a given genome assembly.
+    Returns list of BED line strings.
+    """
+    bed_entries = []
+
+    for var in variants:
+        gene = var['gene']
+        paper = var['paper']
+        paper_info = PAPERS[paper]
+
+        # Get transcript info (cached)
+        tx_accession = TRANSCRIPTS[gene]
+        cache_key = f"{db}_{tx_accession}"
+        if cache_key not in transcript_cache:
+            transcript_cache[cache_key] = get_transcript_info(db, tx_accession)
+            transcript_cache[f"{cache_key}_cds"] = build_cds_regions(transcript_cache[cache_key])
+
+        tx = transcript_cache[cache_key]
+        cds_regions = transcript_cache[f"{cache_key}_cds"]
+        chrom = tx['chrom']
+        strand = tx['strand']
+
+        # Determine genomic coordinates based on coordinate type
+        if var['coord_type'] == 'cds':
+            cds_start, cds_end = parse_hgvsc_pos(var['dna'])
+            if cds_start is None:
+                stats['parse_failed'] += 1
+                continue
+            genomic_start, _ = cds_pos_to_genomic(cds_start, cds_regions, strand)
+            _, genomic_end = cds_pos_to_genomic(cds_end, cds_regions, strand)
+        elif var['coord_type'] == 'protein':
+            aa_pos = var.get('aa_pos') or parse_protein_pos(var['protein'])
+            if aa_pos is None:
+                stats['parse_failed'] += 1
+                continue
+            genomic_start, genomic_end = aa_to_genomic(aa_pos, cds_regions)
+        else:
+            stats['parse_failed'] += 1
+            continue
+
+        if genomic_start is None:
+            stats['coord_failed'] += 1
+            continue
+
+        # Classify variant
+        if paper == 'jia2021':
+            classification = classify_jia_lof(var['lof_score'])
+            score_val = var['lof_score']
+            score_label = 'LOF'
+        else:
+            classification = classify_tavtigian(var['odds_path'])
+            score_val = var['odds_path']
+            score_label = 'OddsPath'
+
+        if classification is None:
+            stats['classify_failed'] += 1
+            continue
+
+        color = COLORS[classification]
+        name = var['name']
+        protein = var['protein']
+        clinvar_id = var.get('clinvar_id', '')
+        paper_ref = paper_info['ref']
+
+        # Build mouseover
+        mouseover = build_mouseover(name, protein, classification,
+                                     clinvar_id, score_val, score_label, paper)
+
+        bed_line = make_bed_entry(chrom, genomic_start, genomic_end, name,
+                                  color, gene, protein, classification,
+                                  clinvar_id, score_val, paper_ref, mouseover)
+        bed_entries.append(bed_line)
+        stats['included'] += 1
+        stats[f'included_{paper}'] = stats.get(f'included_{paper}', 0) + 1
+
+    return bed_entries
+
+def create_track(db):
+    """Create BED and bigBed files for a given genome assembly"""
+    print(f"\n{'='*70}")
+    print(f"Processing {db}")
+    print(f"{'='*70}")
+
+    stats = {
+        'included': 0,
+        'parse_failed': 0,
+        'coord_failed': 0,
+        'classify_failed': 0,
+    }
+
+    transcript_cache = {}
+
+    # Parse all papers
+    all_variants = []
+
+    # Paper 1: Drost 2018
+    print("\n  Paper 1: Drost et al. 2018")
+    drost2018_file = os.path.join(OUTPUT_DIR, "drost2018_supplement.pdf")
+    all_variants.extend(parse_drost2018(drost2018_file))
+
+    # Paper 2: Drost 2020
+    print("\n  Paper 2: Drost et al. 2020")
+    drost2020_file = os.path.join(OUTPUT_DIR, "drost2020_supplement.docx")
+    all_variants.extend(parse_drost2020(drost2020_file))
+
+    # Paper 3: Jia 2021
+    print("\n  Paper 3: Jia et al. 2021")
+    jia2021_file = os.path.join(OUTPUT_DIR, "mmc2.xlsx")
+    all_variants.extend(parse_jia2021(jia2021_file))
+
+    # Paper 4: Rath 2022
+    print("\n  Paper 4: Rath et al. 2022")
+    rath2022_file = os.path.join(OUTPUT_DIR, "rath2022_supplement.pdf")
+    all_variants.extend(parse_rath2022(rath2022_file))
+
+    print(f"\n  Total variants to process: {len(all_variants)}")
+
+    # Convert to BED entries
+    print(f"\n  Querying transcript coordinates from {db}...")
+    bed_entries = process_variants(all_variants, db, transcript_cache, stats)
+
+    # Write BED file
+    bed_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays_{db}.bed")
+    print(f"\n  Writing BED file: {bed_file}")
+    with open(bed_file, 'w') as f:
+        f.write('\n'.join(bed_entries) + '\n')
+
+    # Sort BED file
+    print("  Sorting BED file...")
+    bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}")
+
+    # Create bigBed
+    as_file = os.path.join(OUTPUT_DIR, "insightFunctionalAssays.as")
+    db_label = db.replace('hg', 'Hg')
+    bb_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays{db_label}.bb")
+    chrom_sizes = f"/cluster/data/{db}/chrom.sizes"
+
+    print(f"  Creating bigBed file: {bb_file}")
+    try:
+        bash(f"bedToBigBed -as={as_file} -type=bed9+7 -tab {bed_file} {chrom_sizes} {bb_file}")
+        print(f"    Successfully created: {bb_file}")
+    except Exception as e:
+        print(f"    ERROR creating bigBed: {e}")
+
+    # Print stats
+    print(f"\n  Statistics for {db}:")
+    print(f"    Total included: {stats['included']}")
+    for paper_key in ['drost2018', 'drost2020', 'jia2021', 'rath2022']:
+        count = stats.get(f'included_{paper_key}', 0)
+        if count > 0:
+            print(f"      {PAPERS[paper_key]['ref']}: {count}")
+    print(f"    Parse failed: {stats['parse_failed']}")
+    print(f"    Coordinate failed: {stats['coord_failed']}")
+    print(f"    Classify failed: {stats['classify_failed']}")
+
+    return bed_file, bb_file
+
+# ============================================================================
+# Main
+# ============================================================================
+
+def main():
+    print("=" * 70)
+    print("InSiGHT VCEP Functional Assays Track Generator")
+    print("=" * 70)
+
+    # Create output directory if needed
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+    # Write AutoSql file
+    as_file = os.path.join(OUTPUT_DIR, "insightFunctionalAssays.as")
+    print(f"\nWriting AutoSql file: {as_file}")
+    with open(as_file, 'w') as f:
+        f.write(AUTOSQL)
+
+    # Process both assemblies
+    output_files = {}
+    for db in ['hg38', 'hg19']:
+        try:
+            bed_file, bb_file = create_track(db)
+            output_files[db] = {'bed': bed_file, 'bb': bb_file}
+        except Exception as e:
+            print(f"\nERROR processing {db}: {e}")
+            import traceback
+            traceback.print_exc()
+
+    # Print summary
+    print("\n" + "=" * 70)
+    print("Output Files")
+    print("=" * 70)
+    print(f"  AutoSql file: {as_file}")
+    for db, files in output_files.items():
+        print(f"\n  {db}:")
+        print(f"    BED file:    {files['bed']}")
+        if os.path.exists(files['bb']):
+            print(f"    BigBed file: {files['bb']}")
+
+    print("\n" + "=" * 70)
+    print("Done!")
+    print("=" * 70)
+
+if __name__ == "__main__":
+    main()