55225be4fa1c5b6c1d886c524606b67701e86448 lrnassar Tue Apr 14 10:32:57 2026 -0700 Fix typo, comment error, and bare except in InSiGHT build scripts per code review. refs #36582 diff --git src/hg/makeDb/scripts/insight/insightFunctionalAssays.py src/hg/makeDb/scripts/insight/insightFunctionalAssays.py index 32ceaa0699e..dd08b4c067a 100644 --- src/hg/makeDb/scripts/insight/insightFunctionalAssays.py +++ src/hg/makeDb/scripts/insight/insightFunctionalAssays.py @@ -1,963 +1,963 @@ #!/usr/bin/env python3 """ InSiGHT VCEP Functional Assays Track Generator Generates UCSC Genome Browser tracks (bigBed format) for functional assay evidence from published studies on Lynch syndrome MMR genes. Combines data from: Paper 1: Drost et al. 2018 (PMID:30504929) - MLH1/MSH2 CIMRA assay Paper 2: Drost et al. 2020 (PMID:31965077) - MSH6 CIMRA assay Paper 3: Jia et al. 2021 (PMID:33357406) - MSH2 deep mutational scanning Paper 4: Rath et al. 2022 (PMID:36054288) - MLH1 cell-based functional assay Each variant is classified according to ACMG PS3/BS3 criteria based on Tavtigian OddsPath thresholds or Jia LOF score thresholds. Output: BED9+7 bigBed files for hg38 and hg19. Supplementary data files (place in same directory as this script): drost2018_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/7901556/bin/NIHMS1010100-supplement-CIMRA.pdf drost2020_supplement.docx - https://static-content.springer.com/esm/art%3A10.1038%2Fs41436-019-0736-2/MediaObjects/41436_2019_736_MOESM2_ESM.docx mmc2.xlsx - https://pmc.ncbi.nlm.nih.gov/articles/instance/7820803/bin/mmc2.xlsx rath2022_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/9772141/bin/NIHMS1834139-supplement-supinfo.pdf Author: Generated for InSiGHT VCEP Date: 2025 """ import subprocess import os import re import sys import zipfile import xml.etree.ElementTree as ET try: import openpyxl except ImportError: print("ERROR: openpyxl is required. 
# ============================================================================
# Configuration
# ============================================================================

OUTPUT_DIR = "/hive/users/lrnassar/insightHub/functionalAssays"

# Transcripts for coordinate mapping (current MANE versions in hgsql)
TRANSCRIPTS = {
    'MLH1': 'NM_000249.4',
    'MSH2': 'NM_000251.3',
    'MSH6': 'NM_000179.3',
}

# Paper-specific transcript versions (used in the name field per paper's notation)
PAPER_TRANSCRIPTS = {
    'drost2018': {'MLH1': 'NM_000249.3', 'MSH2': 'NM_000251.2'},
    'drost2020': {'MSH6': 'NM_000179.2'},
    'rath2022': {'MLH1': 'NM_000249.4'},
}

# Classification colors (RGB) from insight.html
COLORS = {
    'PS3_Strong': '180,0,0',        # Dark Red
    'PS3_Moderate': '210,0,0',      # Red
    'PS3_Supporting': '245,152,152',  # Pink
    'Indeterminate': '128,128,128',   # Gray
    'BS3_Supporting': '213,247,213',  # Lime Green
    'BS3_Strong': '0,210,0',          # Green
}

# Paper references and PMIDs
PAPERS = {
    'drost2018': {'ref': 'Drost et al., 2018', 'pmid': '30504929'},
    'drost2020': {'ref': 'Drost et al., 2020', 'pmid': '31965077'},
    'jia2021': {'ref': 'Jia et al., 2021', 'pmid': '33357406'},
    'rath2022': {'ref': 'Rath et al., 2022', 'pmid': '36054288'},
}

# AutoSQL definition for the BED9+7 bigBed schema
AUTOSQL = """table insightFunctionalAssays
"Functional assay evidence for Lynch syndrome MMR gene variant classification"
    (
    string chrom; "Reference sequence chromosome or scaffold"
    uint chromStart; "Start position in chromosome"
    uint chromEnd; "End position in chromosome"
    string name; "Variant name (HGVSc or HGVSp notation)"
    uint score; "Not used, set to 0"
    char[1] strand; "Not used, set to ."
    uint thickStart; "Same as chromStart"
    uint thickEnd; "Same as chromEnd"
    uint reserved; "RGB color value"
    string gene; "Gene symbol"
    string protein; "Protein change (HGVSp notation)"
    string classification; "ACMG evidence classification (PS3/BS3)"
    string clinVarId; "ClinVar variation ID, if available"
    string score_value; "Functional score (OddsPath or LOF)"
    string paperRef; "Publication reference"
    lstring _mouseOver; "HTML mouseover text"
    )
"""

# ============================================================================
# Utility functions
# ============================================================================

def bash(cmd):
    """Run cmd in a bash subprocess and return its combined stdout+stderr.

    Args:
        cmd: shell command string (executed with shell=True; callers pass
             trusted, internally-built commands only).
    Returns:
        The command's stdout (stderr is merged into it) as text.
    Raises:
        RuntimeError (chained to the CalledProcessError) when the command
        exits non-zero, so the failing command and its output are reported.
    """
    try:
        rawBashOutput = subprocess.run(cmd, check=True, shell=True,
                                       stdout=subprocess.PIPE,
                                       universal_newlines=True,
                                       stderr=subprocess.STDOUT)
        bashStdout = rawBashOutput.stdout
    except subprocess.CalledProcessError as e:
        # Chain the original exception so the traceback keeps the root cause.
        raise RuntimeError("command '{}' return with error (code {}): {}".format(
            e.cmd, e.returncode, e.output)) from e
    return bashStdout

def get_transcript_info(db, accession):
    """Query hgsql to get transcript information from ncbiRefSeq.

    Args:
        db: genome assembly database name (e.g. 'hg38').
        accession: RefSeq transcript accession (e.g. 'NM_000249.4').
    Returns:
        Dict with chrom, strand, tx/cds bounds, and parsed exon start/end lists.
    Raises:
        ValueError if the transcript is not present in <db>.ncbiRefSeq.
    NOTE: accession/db are interpolated into a shell + SQL string; both come
    from the hardcoded TRANSCRIPTS table, never from external input.
    """
    query = f"SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{accession}'"
    result = bash(f'hgsql {db} -Ne "{query}"')
    if not result.strip():
        raise ValueError(f"Transcript {accession} not found in {db}.ncbiRefSeq")
    fields = result.strip().split('\t')
    # Parse exon starts and ends (comma-separated, with trailing comma)
    exon_starts = [int(x) for x in fields[7].rstrip(',').split(',')]
    exon_ends = [int(x) for x in fields[8].rstrip(',').split(',')]
    return {
        'name': fields[0],
        'chrom': fields[1],
        'strand': fields[2],
        'txStart': int(fields[3]),
        'txEnd': int(fields[4]),
        'cdsStart': int(fields[5]),
        'cdsEnd': int(fields[6]),
        'exonStarts': exon_starts,
        'exonEnds': exon_ends,
    }

def build_cds_regions(tx):
    """Build an ordered list of (start, end) CDS regions from transcript info.

    Exons are clipped to the CDS bounds; wholly-UTR exons are dropped.
    For minus-strand transcripts the list is reversed so that region order
    follows CDS (5'->3') order rather than genomic order.
    """
    cds_regions = []
    for i in range(len(tx['exonStarts'])):
        ex_start = tx['exonStarts'][i]
        ex_end = tx['exonEnds'][i]
        # Skip exons entirely outside the CDS
        if ex_end <= tx['cdsStart'] or ex_start >= tx['cdsEnd']:
            continue
        # Clip exon to CDS boundaries
        region_start = max(ex_start, tx['cdsStart'])
        region_end = min(ex_end, tx['cdsEnd'])
        cds_regions.append((region_start, region_end))
    # For minus strand, reverse order (CDS starts from high genomic coords)
    if tx['strand'] == '-':
        cds_regions = cds_regions[::-1]
    return cds_regions

def cds_pos_to_genomic(cds_pos, cds_regions, strand):
    """
    Convert a 1-based CDS position to a 0-based genomic coordinate.

    Returns (genomic_start, genomic_end) for the single nucleotide,
    or (None, None) if cds_pos falls outside the CDS.
    """
    cumulative = 0
    for region_start, region_end in cds_regions:
        region_len = region_end - region_start
        region_cds_start = cumulative + 1
        region_cds_end = cumulative + region_len
        if cds_pos >= region_cds_start and cds_pos <= region_cds_end:
            offset = cds_pos - region_cds_start
            if strand == '+':
                genomic_pos = region_start + offset
            else:
                # Minus strand: count from end of region
                genomic_pos = region_end - offset - 1
            return genomic_pos, genomic_pos + 1
        cumulative += region_len
    return None, None

def aa_to_genomic(aa_pos, cds_regions):
    """
    Convert a 1-based amino acid position to 0-based genomic coordinates
    for the 3-nucleotide codon span.

    Only handles + strand genes (MLH1, MSH2, MSH6 are all on + strand).
    Returns (genomic_start, genomic_end) or (None, None).
    """
    nt_start = (aa_pos - 1) * 3 + 1  # 1-based CDS nucleotide position
    nt_end = aa_pos * 3
    # Genomic positions for the first and last nucleotide of the codon
    first_start, first_end = cds_pos_to_genomic(nt_start, cds_regions, '+')
    last_start, last_end = cds_pos_to_genomic(nt_end, cds_regions, '+')
    if first_start is not None and last_start is not None:
        return first_start, last_end
    return None, None

def parse_hgvsc_pos(hgvsc):
    """
    Parse HGVSc notation to extract the CDS position span.

    Returns (start_pos, end_pos) as 1-based CDS positions; for simple
    substitutions start == end. Unrecognized notation -> (None, None).

    Examples:
    - c.4T>A -> (4, 4)
    - c.100G>C -> (100, 100)
    - c.104_105delinsAC -> (104, 105)
    - c.1852_1853delinsGC -> (1852, 1853)
    """
    hgvsc = hgvsc.strip()
    # Simple substitution: c.100G>C
    match = re.match(r'c\.(\d+)[ACGT]>[ACGT]', hgvsc)
    if match:
        pos = int(match.group(1))
        return pos, pos
    # Delins spanning positions: c.104_105delinsAC
    match = re.match(r'c\.(\d+)_(\d+)delins', hgvsc)
    if match:
        return int(match.group(1)), int(match.group(2))
    return None, None

def parse_protein_pos(prot):
    """
    Parse the amino acid position from protein notation.

    Handles both p.X###Y and X###Y formats; returns None if unparseable.

    Examples:
    - p.W372R -> 372
    - p.K13T -> 13
    - M1A -> 1
    """
    prot = prot.strip()
    if prot.startswith('p.'):
        prot = prot[2:]
    match = re.match(r'[A-Z*](\d+)', prot)
    if match:
        return int(match.group(1))
    return None

# ============================================================================
# Classification functions
# ============================================================================

def classify_tavtigian(odds_path):
    """
    Classify a variant using Tavtigian OddsPath thresholds.
    Used for Papers 1, 2, 4 (CIMRA assays).

    Thresholds:
      PS3_Strong:     > 18.7
      PS3_Moderate:   > 4.3  and <= 18.7
      PS3_Supporting: > 2.08 and <= 4.3
      Indeterminate:  > 0.48 and <= 2.08
      BS3_Supporting: > 0.05 and <= 0.48
      BS3_Strong:     <= 0.05

    Returns the classification string, or None if odds_path is not numeric.
    """
    try:
        val = float(odds_path)
    except (ValueError, TypeError):
        return None
    if val > 18.7:
        return 'PS3_Strong'
    elif val > 4.3:
        return 'PS3_Moderate'
    elif val > 2.08:
        return 'PS3_Supporting'
    elif val > 0.48:
        return 'Indeterminate'
    elif val > 0.05:
        return 'BS3_Supporting'
    else:
        return 'BS3_Strong'

def classify_jia_lof(lof_score):
    """
    Classify a variant using Jia LOF score thresholds.
    Used for Paper 3 (deep mutational scanning).

    Thresholds:
      PS3_Strong:    >= 0.4
      Indeterminate: >= 0 and < 0.4
      BS3_Strong:    < 0

    Returns the classification string, or None if lof_score is not numeric.
    """
    try:
        val = float(lof_score)
    except (ValueError, TypeError):
        return None
    if val >= 0.4:
        return 'PS3_Strong'
    elif val >= 0:
        return 'Indeterminate'
    else:
        return 'BS3_Strong'

# ============================================================================
# MouseOver builder
# ============================================================================

def build_mouseover(name, protein, classification, clinvar_id, score_val,
                    score_label, paper_key):
    """Build the mouseover text for a variant.

    Parts are joined with single spaces: the result is stored in one
    tab-separated BED field (see make_bed_entry), so it must remain a
    single line with no tabs or newlines.
    """
    paper_info = PAPERS[paper_key]
    parts = []
    parts.append(f"HGVSc/HGVSp: {name}")
    parts.append(f"Protein: {protein}")
    parts.append(f"Classification: {classification}")
    if clinvar_id and clinvar_id != 'NA' and clinvar_id != '':
        parts.append(f'ClinVar ID: {clinvar_id}')
    else:
        parts.append("ClinVar ID: N/A")
    parts.append(f"{score_label}: {score_val}")
    parts.append(f'Paper: {paper_info["ref"]}')
    return " ".join(parts)

# ============================================================================
# BED entry builder
# ============================================================================

def make_bed_entry(chrom, start, end, name, color, gene, protein,
                   classification, clinvar_id, score_val, paper_ref, mouseover):
    """Create one tab-separated BED9+7 line (score fixed at 0, strand at '.')."""
    return (f"{chrom}\t{start}\t{end}\t{name}\t0\t.\t{start}\t{end}\t{color}"
            f"\t{gene}\t{protein}\t{classification}\t{clinvar_id}"
            f"\t{score_val}\t{paper_ref}\t{mouseover}")
# ============================================================================
# Data parsing: Paper 1 - Drost et al. 2018
# ============================================================================

def parse_drost2018(filepath):
    """
    Drost et al. 2018 (PMID:30504929) - Supplementary Table S2.
    Source: NIHMS1010100-supplement-CIMRA.pdf
    Genes: MLH1 (NM_000249.3), MSH2 (NM_000251.2)

    Data hardcoded from published supplement for reliability. OddsPath
    values are the CIMRA-based "Odds for Pathogenicity" column
    (Classification Prior-P + CIMRA section). European decimal commas
    converted to periods.

    ClinVar ID exceptions:
    - V470G (c.1409T>G): Paper listed ClinVar 90658, but that ID now
      resolves to V470E (c.1409T>A) - a different nucleotide change at the
      same position. ClinVar appears to have reassigned the ID since
      publication. Removed to avoid linking to the wrong variant.

    The filepath argument is accepted for parity with the other parsers but
    is not read; the table is hardcoded below.

    Returns list of variant dicts.
    """
    # (gene, dna, protein, clinvar_id, odds_path_cimra)
    DATA = [
        # MLH1 variants (39)
        ('MLH1', 'c.62C>A', 'p.A21E', '90297', '23.621'),
        ('MLH1', 'c.62C>T', 'p.A21V', '90298', '29.238'),
        ('MLH1', 'c.73A>T', 'p.I25F', '90343', '23.919'),
        ('MLH1', 'c.104_105delinsAC', 'p.M35N', '17105', '20.835'),
        ('MLH1', 'c.143A>C', 'p.Q48P', '89739', '31.923'),
        ('MLH1', 'c.146T>A', 'p.V49E', '89749', '7.829'),
        ('MLH1', 'c.191A>G', 'p.N64S', '89947', '0.483'),
        ('MLH1', 'c.203T>A', 'p.I68N', '90008', '29.607'),
        ('MLH1', 'c.244A>G', 'p.T82A', '90116', '47.103'),
        ('MLH1', 'c.245C>T', 'p.T82I', '90118', '33.147'),
        ('MLH1', 'c.301G>A', 'p.G101S', '90137', '44.238'),
        ('MLH1', 'c.332C>T', 'p.A111V', '90171', '21.908'),
        ('MLH1', 'c.346A>C', 'p.T116P', '', '17.699'),
        ('MLH1', 'c.382G>C', 'p.A128P', '90199', '32.734'),
        ('MLH1', 'c.394G>C', 'p.D132H', '17096', '0.002'),
        ('MLH1', 'c.779T>G', 'p.L260R', '90350', '18.378'),
        ('MLH1', 'c.793C>A', 'p.R265S', '90380', '10.580'),
        ('MLH1', 'c.803A>G', 'p.E268G', '90382', '0.070'),
        ('MLH1', 'c.911A>T', 'p.D304V', '90439', '33.990'),
        ('MLH1', 'c.918T>A', 'p.N306K', '90441', '51.428'),
        ('MLH1', 'c.1321G>A', 'p.A441T', '89696', '0.001'),
        ('MLH1', 'c.1664T>C', 'p.L555P', '89822', '57.577'),
        ('MLH1', 'c.1733A>G', 'p.E578G', '17090', '0.003'),
        ('MLH1', 'c.1742C>T', 'p.P581L', '41635', '0.002'),
        ('MLH1', 'c.1745T>C', 'p.L582P', '89871', '28.874'),
        ('MLH1', 'c.1799A>G', 'p.E600G', '89890', '0.002'),
        ('MLH1', 'c.1808C>G', 'p.P603R', '41636', '0.002'),
        ('MLH1', 'c.1823C>A', 'p.A608D', '89897', '2.905'),
        ('MLH1', 'c.1853A>C', 'p.K618T', '89906', '3.377'),
        ('MLH1', 'c.1855G>C', 'p.A619P', '89909', '2.833'),
        ('MLH1', 'c.1865T>A', 'p.L622H', '29657', '6.486'),
        ('MLH1', 'c.1943C>T', 'p.P648L', '89953', '0.521'),
        ('MLH1', 'c.1942C>T', 'p.P648S', '17097', '3.507'),
        ('MLH1', 'c.1976G>T', 'p.R659L', '89966', '4.507'),
        ('MLH1', 'c.2038T>C', 'p.C680R', '90006', '37.579'),
        ('MLH1', 'c.2041G>A', 'p.A681T', '17099', '0.031'),
        ('MLH1', 'c.2101C>A', 'p.Q701K', '90042', '0.034'),
        ('MLH1', 'c.2128A>G', 'p.N710D', '', '0.160'),
        ('MLH1', 'c.2263A>G', 'p.R755G', '219292', '68.636'),
        # MSH2 variants (35)
        ('MSH2', 'c.23C>T', 'p.T8M', '90964', '0.166'),
        ('MSH2', 'c.277C>T', 'p.L93F', '91044', '0.554'),
        ('MSH2', 'c.287G>A', 'p.R96H', '91053', '0.051'),
        ('MSH2', 'c.317G>A', 'p.R106K', '91062', '0.013'),
        ('MSH2', 'c.488T>A', 'p.V163D', '91107', '89.331'),
        ('MSH2', 'c.488T>G', 'p.V163G', '91108', '79.790'),
        ('MSH2', 'c.493T>G', 'p.Y165D', '91111', '81.818'),
        ('MSH2', 'c.505A>G', 'p.I169V', '91115', '0.000'),
        ('MSH2', 'c.560T>G', 'p.L187R', '91135', '59.040'),
        ('MSH2', 'c.595T>C', 'p.C199R', '91146', '105.160'),
        ('MSH2', 'c.599T>A', 'p.V200D', '91148', '89.331'),
        ('MSH2', 'c.929T>C', 'p.L310P', '91245', '55.450'),
        ('MSH2', 'c.989T>C', 'p.L330P', '91270', '81.818'),
        ('MSH2', 'c.991A>G', 'p.N331D', '91271', '0.125'),
        ('MSH2', 'c.1022T>C', 'p.L341P', '90507', '42.073'),
        ('MSH2', 'c.1046C>G', 'p.P349R', '90513', '80.798'),
        ('MSH2', 'c.1077A>T', 'p.R359S', '90539', '62.080'),
        ('MSH2', 'c.1168C>T', 'p.L390F', '41641', '0.003'),
        ('MSH2', 'c.1255C>A', 'p.Q419K', '90583', '0.002'),
        ('MSH2', 'c.1319T>C', 'p.L440P', '90625', '33.990'),
        ('MSH2', 'c.1409T>G', 'p.V470G', '', '83.898'),  # Paper listed ClinVar 90658, but that ID now points to V470E (c.1409T>A); removed
        ('MSH2', 'c.1571G>T', 'p.R524L', '90701', '0.179'),
        ('MSH2', 'c.1690A>G', 'p.T564A', '90750', '0.001'),
        ('MSH2', 'c.1730T>C', 'p.I577T', '41644', '0.323'),
        ('MSH2', 'c.1808A>G', 'p.D603G', '90791', '66.934'),
        ('MSH2', 'c.1955C>A', 'p.P652H', '90823', '0.035'),
        ('MSH2', 'c.1963G>A', 'p.V655I', '127637', '0.002'),
        ('MSH2', 'c.2021G>A', 'p.G674D', '90861', '50.787'),
        ('MSH2', 'c.2023A>G', 'p.K675E', '408453', '54.075'),
        ('MSH2', 'c.2047G>A', 'p.G683R', '90868', '74.003'),
        ('MSH2', 'c.2063T>G', 'p.M688R', '90874', '77.813'),
        ('MSH2', 'c.2087C>T', 'p.P696L', '90881', '18.845'),
        ('MSH2', 'c.2276G>A', 'p.G759E', '41866', '2.661'),
        ('MSH2', 'c.2287G>C', 'p.A763P', '', '35.294'),
        ('MSH2', 'c.2500G>A', 'p.A834T', '90986', '0.004'),
    ]
    print(f" Loading Drost 2018 data (hardcoded from Supplementary Table S2)...")
    variants = []
    for gene, dna, protein, clinvar_id, odds_path in DATA:
        # Name field uses the transcript version the paper itself cited
        transcript = PAPER_TRANSCRIPTS['drost2018'][gene]
        variants.append({
            'gene': gene,
            'dna': dna,
            'protein': protein,
            'clinvar_id': clinvar_id,
            'odds_path': odds_path,
            'name': f"{transcript}:{dna}",
            'paper': 'drost2018',
            'coord_type': 'cds',
        })
    print(f" Loaded {len(variants)} variants from Drost 2018")
    return variants

# ============================================================================
# Data parsing: Paper 2 - Drost et al. 2020
# ============================================================================

def get_docx_tables(docx_path):
    """Extract all tables from a DOCX file via WordprocessingML XML parsing.

    Returns a list of tables; each table is a list of rows; each row is a
    list of cell strings (all w:t text runs in a cell concatenated).
    """
    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    with zipfile.ZipFile(docx_path, 'r') as z:
        xml_content = z.read('word/document.xml')
        root = ET.fromstring(xml_content)
        tables = root.findall('.//w:tbl', ns)
        result = []
        for table in tables:
            rows = table.findall('.//w:tr', ns)
            table_data = []
            for row in rows:
                cells = row.findall('.//w:tc', ns)
                row_data = []
                for cell in cells:
                    texts = []
                    # A cell may contain several text runs; join them all
                    for p in cell.findall('.//w:t', ns):
                        if p.text:
                            texts.append(p.text)
                    row_data.append(''.join(texts))
                table_data.append(row_data)
            result.append(table_data)
        return result

def parse_drost2020(filepath):
    """
    Parse Drost 2020 supplementary DOCX (Tables S1, S3, S5).
    Gene: MSH6 (NM_000179.2 for S1/S3, protein-only for S5)
    Returns list of variant dicts (empty list if the file is missing).

    Table structure (from DOCX XML):
      Table 0 = S1 (31 calibration variants): DNA col 0, Protein col 1,
                ClinVar col 2, OddsPath col 10
      Table 2 = S3 (18 additional variants): DNA col 0, Protein col 1,
                ClinVar col 2, OddsPath col 8
      Table 4 = S5 (38 screen variants): Protein col 0, OddsPath col 4

    Errata corrections:
    - p.V509Ag (c.1526T>C) in S1: Trailing 'g' is a typo in the published
      DOCX. ClinVar 41588 confirms the correct notation is p.Val509Ala
      (p.V509A). Corrected during parsing.
    """
    if not os.path.exists(filepath):
        print(f" WARNING: {filepath} not found, skipping Drost 2020")
        return []
    print(f" Parsing {filepath}...")
    tables = get_docx_tables(filepath)
    transcript = PAPER_TRANSCRIPTS['drost2020']['MSH6']
    variants = []
    # Table 0 = S1: 31 calibration variants (rows 2-32, skipping header rows 0-1)
    if len(tables) > 0:
        table_s1 = tables[0]
        for row in table_s1[2:]:  # Skip 2 header rows
            if len(row) < 11:
                continue
            dna = row[0].strip()
            protein = row[1].strip()
            clinvar_id = row[2].strip()
            odds_path = row[10].strip()
            if not dna.startswith('c.'):
                continue
            if clinvar_id == 'NA':
                clinvar_id = ''
            # Fix known typos in published DOCX: trailing 'g' on protein names
            # is a formatting artifact (e.g., p.V509Ag, p.E220Dg, p.E1163Vg,
            # p.R1304Kg); ClinVar confirms correct forms without trailing 'g'
            protein = re.sub(r'^(p\.[A-Z]\d+[A-Z])g$', r'\1', protein)
            variants.append({
                'gene': 'MSH6',
                'dna': dna,
                'protein': protein,
                'clinvar_id': clinvar_id,
                'odds_path': odds_path,
                'name': f"{transcript}:{dna}",
                'paper': 'drost2020',
                'coord_type': 'cds',
            })
    print(f" S1: {len([v for v in variants if v['coord_type'] == 'cds'])} variants")
    # Table 2 = S3: 18 additional variants (rows 2-19, skipping header rows 0-1)
    s3_count = 0
    if len(tables) > 2:
        table_s3 = tables[2]
        for row in table_s3[2:]:  # Skip 2 header rows
            if len(row) < 9:
                continue
            dna = row[0].strip()
            protein = row[1].strip()
            clinvar_id = row[2].strip()
            odds_path = row[8].strip()
            if not dna.startswith('c.'):
                continue
            if clinvar_id == 'NA':
                clinvar_id = ''
            variants.append({
                'gene': 'MSH6',
                'dna': dna,
                'protein': protein,
                'clinvar_id': clinvar_id,
                'odds_path': odds_path,
                'name': f"{transcript}:{dna}",
                'paper': 'drost2020',
                'coord_type': 'cds',
            })
            s3_count += 1
    print(f" S3: {s3_count} variants")
    # Table 4 = S5: 38 genetic screen variants (rows 2-39, skipping header rows 0-1)
    # These have no DNA notation, so they are mapped via protein position later
    s5_count = 0
    if len(tables) > 4:
        table_s5 = tables[4]
        for row in table_s5[2:]:  # Skip 2 header rows
            if len(row) < 5:
                continue
            protein = row[0].strip()
            odds_path = row[4].strip()
            if not protein.startswith('p.'):
                continue
            variants.append({
                'gene': 'MSH6',
                'dna': '',
                'protein': protein,
                'clinvar_id': '',
                'odds_path': odds_path,
                'name': protein,
                'paper': 'drost2020',
                'coord_type': 'protein',
            })
            s5_count += 1
    print(f" S5: {s5_count} variants")
    print(f" Total: {len(variants)} variants from Drost 2020")
    return variants

# ============================================================================
# Data parsing: Paper 3 - Jia et al. 2021
# ============================================================================

def parse_jia2021(filepath):
    """
    Parse Jia 2021 supplementary XLSX (TableS5_AllVariants).
    Gene: MSH2 (NM_000251, mapped using NM_000251.3)
    Returns list of variant dicts (empty list if the file is missing).

    TableS5 columns: Variant (col 0), Position (col 1), LOF score (col 2)
    Variants in compact notation like M1A, need p. prefix.
    Skip variants with LOF score = None.
    """
    if not os.path.exists(filepath):
        print(f" WARNING: {filepath} not found, skipping Jia 2021")
        return []
    print(f" Parsing {filepath}...")
    wb = openpyxl.load_workbook(filepath, read_only=True)
    variants = []
    ws = wb['TableS5_AllVariants']
    skipped_none = 0
    for i, row in enumerate(ws.iter_rows(min_row=2, values_only=True)):
        vals = list(row)
        if len(vals) < 3:
            continue
        variant_name = vals[0]
        position = vals[1]
        lof_score = vals[2]
        if variant_name is None:
            continue
        if lof_score is None:
            skipped_none += 1
            continue
        variant_str = str(variant_name).strip()
        protein = f"p.{variant_str}"
        # Use Position column if available, otherwise parse from variant name
        if position is not None:
            aa_pos = int(position)
        else:
            aa_pos = parse_protein_pos(variant_str)
        if aa_pos is None:
            continue
        variants.append({
            'gene': 'MSH2',
            'dna': '',
            'protein': protein,
            'clinvar_id': '',
            'lof_score': str(lof_score),
            'name': protein,
            'paper': 'jia2021',
            'coord_type': 'protein',
            'aa_pos': aa_pos,
        })
    wb.close()
    print(f" Parsed {len(variants)} variants from Jia 2021 (skipped {skipped_none} with no LOF score)")
    return variants

# ============================================================================
# Data parsing: Paper 4 - Rath et al. 2022
# ============================================================================

def parse_rath2022(filepath):
    """
    Rath et al. 2022 (PMID:36054288) - Supplementary Table S1.
    Source: NIHMS1834139-supplement-supinfo.pdf
    Gene: MLH1 (NM_000249.4 - MANE transcript)

    Data hardcoded from published supplement for reliability.
    OddsPath_Non-functional column. Variants with "-" (no OddsPath) are
    excluded. Scientific notation values (e.g. 1.08E-7) preserved as
    strings. "1,450" = 1450 (American thousands separator, not European
    decimal).

    ClinVar ID exceptions:
    - K618A (c.1852_1853delinsGC): Paper listed ClinVar 32128, which is a
      ClinVar measure_id (allele-level), not a variation_id (VCV). The
      /clinvar/variation/32128/ URL returns 404. Updated to VCV 17089,
      which is the current ClinVar variation record for this variant
      (NM_000249.4:c.1852_1853delinsGC, p.Lys618Ala).

    The filepath argument is accepted for parity with the other parsers but
    is not read; the table is hardcoded below.

    Returns list of variant dicts.
    """
    # (protein_short, dna, clinvar_id, odds_path_non_functional)
    # Protein short is the cell line name = protein change without p. prefix
    DATA = [
        # Benign (9 with OddsPath, 2 skipped: I219V, H718Y have "-")
        ('D132H', 'c.394G>C', '17096', '1.08E-7'),
        ('V213M', 'c.637G>A', '41640', '5.76E-4'),
        ('E268G', 'c.803A>G', '90382', '2.24E-4'),
        ('S406N', 'c.1217G>A', '41634', '1.49E-10'),
        ('E578G', 'c.1733A>G', '17090', '1.44E-6'),
        ('K618A', 'c.1852_1853delinsGC', '17089', '2.90E-13'),  # Paper listed measure_id 32128 (404); updated to VCV 17089
        ('Q689R', 'c.2066A>G', '90016', '1.11E-4'),
        ('S692P', 'c.2074T>C', '127621', '4.02E-4'),
        ('V716M', 'c.2146G>A', '41639', '4.07E-5'),
        # VUS (6 with OddsPath, 15 skipped with "-")
        ('C39R', 'c.115T>C', '89655', '71.4'),
        ('D63N', 'c.187G>A', '89920', '0.00'),
        ('N64S', 'c.191A>G', '89947', '5.00E-2'),
        ('L73P', 'c.218T>C', '90086', '13.7'),
        ('G454R', 'c.1360G>C', '89705', '1.11E-2'),
        ('R474P', 'c.1421G>C', '', '0.489'),
        # Pathogenic (11 with OddsPath)
        ('I19F', 'c.55A>T', '90274', '346'),
        ('D41H', 'c.121G>C', '89682', '3.99E+4'),
        ('S44F', 'c.131C>T', '17079', '2.46E+4'),
        ('G67R', 'c.199G>A', '89992', '4.27E+4'),
        ('R100P', 'c.299G>C', '90133', '1450'),
        ('I107R', 'c.320T>G', '90167', '210'),
        ('T117M', 'c.350C>T', '17094', '1.33E+18'),
        ('S247P', 'c.739T>C', '90342', '105'),
        ('R265S', 'c.793C>A', '90380', '37.2'),
        ('I276R', 'c.827T>G', '520540', '319'),
        ('R687W', 'c.2059C>T', '90014', '1.74E+4'),
    ]
    print(f" Loading Rath 2022 data (hardcoded from Supplementary Table S1)...")
    transcript = PAPER_TRANSCRIPTS['rath2022']['MLH1']
    variants = []
    for protein_short, dna, clinvar_id, odds_path in DATA:
        protein = f"p.{protein_short}"
        variants.append({
            'gene': 'MLH1',
            'dna': dna,
            'protein': protein,
            'clinvar_id': clinvar_id,
            'odds_path': odds_path,
            'name': f"{transcript}:{dna}",
            'paper': 'rath2022',
            'coord_type': 'cds',
        })
    print(f" Loaded {len(variants)} variants from Rath 2022")
    return variants
# ============================================================================
# BED generation
# ============================================================================

def process_variants(variants, db, transcript_cache, stats):
    """
    Convert parsed variants to BED entries for a given genome assembly.

    Args:
        variants: list of variant dicts from the parse_* functions.
        db: assembly database name ('hg38' or 'hg19').
        transcript_cache: dict reused across calls so each transcript is
            queried from hgsql only once per assembly.
        stats: mutable counter dict; 'included', 'parse_failed',
            'coord_failed', 'classify_failed' and per-paper counts are
            incremented in place.
    Returns:
        List of tab-separated BED9+7 line strings.
    """
    bed_entries = []
    for var in variants:
        gene = var['gene']
        paper = var['paper']
        paper_info = PAPERS[paper]
        # Get transcript info (cached per db+accession; CDS regions cached too)
        tx_accession = TRANSCRIPTS[gene]
        cache_key = f"{db}_{tx_accession}"
        if cache_key not in transcript_cache:
            transcript_cache[cache_key] = get_transcript_info(db, tx_accession)
            transcript_cache[f"{cache_key}_cds"] = build_cds_regions(transcript_cache[cache_key])
        tx = transcript_cache[cache_key]
        cds_regions = transcript_cache[f"{cache_key}_cds"]
        chrom = tx['chrom']
        strand = tx['strand']
        # Determine genomic coordinates based on coordinate type
        if var['coord_type'] == 'cds':
            cds_start, cds_end = parse_hgvsc_pos(var['dna'])
            if cds_start is None:
                stats['parse_failed'] += 1
                continue
            # NOTE(review): start comes from cds_start and end from cds_end;
            # on a '-' strand transcript a multi-base span would invert
            # (start > end). Per aa_to_genomic's docstring all three genes
            # here are on the '+' strand, so this does not arise — confirm
            # before adding '-' strand genes.
            genomic_start, _ = cds_pos_to_genomic(cds_start, cds_regions, strand)
            _, genomic_end = cds_pos_to_genomic(cds_end, cds_regions, strand)
        elif var['coord_type'] == 'protein':
            aa_pos = var.get('aa_pos') or parse_protein_pos(var['protein'])
            if aa_pos is None:
                stats['parse_failed'] += 1
                continue
            genomic_start, genomic_end = aa_to_genomic(aa_pos, cds_regions)
        else:
            stats['parse_failed'] += 1
            continue
        if genomic_start is None:
            stats['coord_failed'] += 1
            continue
        # Classify variant: Jia 2021 uses LOF scores, the CIMRA papers OddsPath
        if paper == 'jia2021':
            classification = classify_jia_lof(var['lof_score'])
            score_val = var['lof_score']
            score_label = 'LOF'
        else:
            classification = classify_tavtigian(var['odds_path'])
            score_val = var['odds_path']
            score_label = 'OddsPath'
        if classification is None:
            stats['classify_failed'] += 1
            continue
        color = COLORS[classification]
        name = var['name']
        protein = var['protein']
        clinvar_id = var.get('clinvar_id', '')
        paper_ref = paper_info['ref']
        # Build mouseover
        mouseover = build_mouseover(name, protein, classification, clinvar_id,
                                    score_val, score_label, paper)
        bed_line = make_bed_entry(chrom, genomic_start, genomic_end, name,
                                  color, gene, protein, classification,
                                  clinvar_id, score_val, paper_ref, mouseover)
        bed_entries.append(bed_line)
        stats['included'] += 1
        stats[f'included_{paper}'] = stats.get(f'included_{paper}', 0) + 1
    return bed_entries

def create_track(db):
    """Create BED and bigBed files for a given genome assembly.

    Parses all four papers, maps variants to genomic coordinates via
    hgsql, writes a sorted BED file, and runs bedToBigBed.
    Returns (bed_file, bb_file) paths.
    """
    print(f"\n{'='*70}")
    print(f"Processing {db}")
    print(f"{'='*70}")
    stats = {
        'included': 0,
        'parse_failed': 0,
        'coord_failed': 0,
        'classify_failed': 0,
    }
    transcript_cache = {}
    # Parse all papers
    all_variants = []
    # Paper 1: Drost 2018
    print("\n Paper 1: Drost et al. 2018")
    drost2018_file = os.path.join(OUTPUT_DIR, "drost2018_supplement.pdf")
    all_variants.extend(parse_drost2018(drost2018_file))
    # Paper 2: Drost 2020
    print("\n Paper 2: Drost et al. 2020")
    drost2020_file = os.path.join(OUTPUT_DIR, "drost2020_supplement.docx")
    all_variants.extend(parse_drost2020(drost2020_file))
    # Paper 3: Jia 2021
    print("\n Paper 3: Jia et al. 2021")
    jia2021_file = os.path.join(OUTPUT_DIR, "mmc2.xlsx")
    all_variants.extend(parse_jia2021(jia2021_file))
    # Paper 4: Rath 2022
    print("\n Paper 4: Rath et al. 2022")
    rath2022_file = os.path.join(OUTPUT_DIR, "rath2022_supplement.pdf")
    all_variants.extend(parse_rath2022(rath2022_file))
    print(f"\n Total variants to process: {len(all_variants)}")
    # Convert to BED entries
    print(f"\n Querying transcript coordinates from {db}...")
    bed_entries = process_variants(all_variants, db, transcript_cache, stats)
    # Write BED file
    bed_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays_{db}.bed")
    print(f"\n Writing BED file: {bed_file}")
    with open(bed_file, 'w') as f:
        f.write('\n'.join(bed_entries) + '\n')
    # Sort BED file (bedToBigBed requires chrom/start-sorted input)
    print(" Sorting BED file...")
    bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}")
    # Create bigBed
    as_file = os.path.join(OUTPUT_DIR, "insightFunctionalAssays.as")
    db_label = db.replace('hg', 'Hg')
    bb_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays{db_label}.bb")
    chrom_sizes = f"/cluster/data/{db}/chrom.sizes"
    print(f" Creating bigBed file: {bb_file}")
    try:
        bash(f"bedToBigBed -as={as_file} -type=bed9+7 -tab {bed_file} {chrom_sizes} {bb_file}")
        print(f" Successfully created: {bb_file}")
    except Exception as e:
        # Boundary handler: bigBed creation is best-effort; the BED file is
        # still written, so report the failure and continue
        print(f" ERROR creating bigBed: {e}")
    # Print stats
    print(f"\n Statistics for {db}:")
    print(f" Total included: {stats['included']}")
    for paper_key in ['drost2018', 'drost2020', 'jia2021', 'rath2022']:
        count = stats.get(f'included_{paper_key}', 0)
        if count > 0:
            print(f" {PAPERS[paper_key]['ref']}: {count}")
    print(f" Parse failed: {stats['parse_failed']}")
    print(f" Coordinate failed: {stats['coord_failed']}")
    print(f" Classify failed: {stats['classify_failed']}")
    return bed_file, bb_file

# ============================================================================
# Main
# ============================================================================

def main():
    """Generate the AutoSql schema and the hg38/hg19 track files."""
    print("=" * 70)
    print("InSiGHT VCEP Functional Assays Track Generator")
    print("=" * 70)
    # Create output directory if needed
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Write AutoSql file
    as_file = os.path.join(OUTPUT_DIR, "insightFunctionalAssays.as")
    print(f"\nWriting AutoSql file: {as_file}")
    with open(as_file, 'w') as f:
        f.write(AUTOSQL)
    # Process both assemblies; a failure on one assembly does not abort the other
    output_files = {}
    for db in ['hg38', 'hg19']:
        try:
            bed_file, bb_file = create_track(db)
            output_files[db] = {'bed': bed_file, 'bb': bb_file}
        except Exception as e:
            print(f"\nERROR processing {db}: {e}")
            import traceback
            traceback.print_exc()
    # Print summary
    print("\n" + "=" * 70)
    print("Output Files")
    print("=" * 70)
    print(f" AutoSql file: {as_file}")
    for db, files in output_files.items():
        print(f"\n {db}:")
        print(f" BED file: {files['bed']}")
        if os.path.exists(files['bb']):
            print(f" BigBed file: {files['bb']}")
    print("\n" + "=" * 70)
    print("Done!")
    print("=" * 70)

if __name__ == "__main__":
    main()