55225be4fa1c5b6c1d886c524606b67701e86448
lrnassar
Tue Apr 14 10:32:57 2026 -0700
Fix typo, comment error, and bare except in InSiGHT build scripts per code review. refs #36582
diff --git src/hg/makeDb/scripts/insight/insightFunctionalAssays.py src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
index 32ceaa0699e..dd08b4c067a 100644
--- src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
+++ src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
@@ -1,963 +1,963 @@
#!/usr/bin/env python3
"""
InSiGHT VCEP Functional Assays Track Generator
Generates UCSC Genome Browser tracks (bigBed format) for functional assay evidence
from published studies on Lynch syndrome MMR genes. Combines data from:
Paper 1: Drost et al. 2018 (PMID:30504929) - MLH1/MSH2 CIMRA assay
Paper 2: Drost et al. 2020 (PMID:31965077) - MSH6 CIMRA assay
Paper 3: Jia et al. 2021 (PMID:33357406) - MSH2 deep mutational scanning
Paper 4: Rath et al. 2022 (PMID:36054288) - MLH1 cell-based functional assay
Each variant is classified according to ACMG PS3/BS3 criteria based on
Tavtigian OddsPath thresholds or Jia LOF score thresholds.
Output: BED9+7 bigBed files for hg38 and hg19.
Supplementary data files (place in same directory as this script):
drost2018_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/7901556/bin/NIHMS1010100-supplement-CIMRA.pdf
drost2020_supplement.docx - https://static-content.springer.com/esm/art%3A10.1038%2Fs41436-019-0736-2/MediaObjects/41436_2019_736_MOESM2_ESM.docx
mmc2.xlsx - https://pmc.ncbi.nlm.nih.gov/articles/instance/7820803/bin/mmc2.xlsx
rath2022_supplement.pdf - https://pmc.ncbi.nlm.nih.gov/articles/instance/9772141/bin/NIHMS1834139-supplement-supinfo.pdf
Author: Generated for InSiGHT VCEP
Date: 2025
"""
import subprocess
import os
import re
import sys
import zipfile
import xml.etree.ElementTree as ET
# Dependency check: openpyxl is only needed to read the Jia 2021 XLSX
# supplement, but we fail fast at startup with an install hint rather than
# crashing mid-run with an ImportError traceback.
try:
    import openpyxl
except ImportError:
    print("ERROR: openpyxl is required. Install with: pip install openpyxl")
    sys.exit(1)
# ============================================================================
# Configuration
# ============================================================================
# All input supplements and all generated output live in this directory.
OUTPUT_DIR = "/hive/users/lrnassar/insightHub/functionalAssays"
# Transcripts for coordinate mapping (current MANE versions in hgsql).
# These are the accessions looked up in ncbiRefSeq to convert CDS / amino
# acid positions to genomic coordinates.
TRANSCRIPTS = {
    'MLH1': 'NM_000249.4',
    'MSH2': 'NM_000251.3',
    'MSH6': 'NM_000179.3',
}
# Paper-specific transcript versions (used in the name field per paper's notation)
PAPER_TRANSCRIPTS = {
    'drost2018': {'MLH1': 'NM_000249.3', 'MSH2': 'NM_000251.2'},
    'drost2020': {'MSH6': 'NM_000179.2'},
    'rath2022': {'MLH1': 'NM_000249.4'},
}
# Classification colors (RGB) from insight.html.
# Keys must exactly match the strings returned by classify_tavtigian() and
# classify_jia_lof(), since process_variants() indexes COLORS by those values.
COLORS = {
    'PS3_Strong': '180,0,0',  # Dark Red
    'PS3_Moderate': '210,0,0',  # Red
    'PS3_Supporting': '245,152,152',  # Pink
    'Indeterminate': '128,128,128',  # Gray
    'BS3_Supporting': '213,247,213',  # Lime Green
    'BS3_Strong': '0,210,0',  # Green
}
# Paper references and PMIDs; keyed by the 'paper' field stored on each
# parsed variant dict.
PAPERS = {
    'drost2018': {'ref': 'Drost et al., 2018', 'pmid': '30504929'},
    'drost2020': {'ref': 'Drost et al., 2020', 'pmid': '31965077'},
    'jia2021': {'ref': 'Jia et al., 2021', 'pmid': '33357406'},
    'rath2022': {'ref': 'Rath et al., 2022', 'pmid': '36054288'},
}
# AutoSQL definition describing the BED9+7 schema passed to bedToBigBed.
AUTOSQL = """table insightFunctionalAssays
"Functional assay evidence for Lynch syndrome MMR gene variant classification"
(
string chrom; "Reference sequence chromosome or scaffold"
uint chromStart; "Start position in chromosome"
uint chromEnd; "End position in chromosome"
string name; "Variant name (HGVSc or HGVSp notation)"
uint score; "Not used, set to 0"
char[1] strand; "Not used, set to ."
uint thickStart; "Same as chromStart"
uint thickEnd; "Same as chromEnd"
uint reserved; "RGB color value"
string gene; "Gene symbol"
string protein; "Protein change (HGVSp notation)"
string classification; "ACMG evidence classification (PS3/BS3)"
string clinVarId; "ClinVar variation ID, if available"
string score_value; "Functional score (OddsPath or LOF)"
string paperRef; "Publication reference"
lstring _mouseOver; "HTML mouseover text"
)
"""
# ============================================================================
# Utility functions
# ============================================================================
def bash(cmd):
    """Run *cmd* in a shell subprocess and return its captured output.

    stderr is merged into stdout so callers see a single text stream.
    shell=True is required because callers pass compound commands (hgsql
    invocations, sort with -o); cmd strings are built internally from
    constants, not from untrusted input.

    Raises RuntimeError (chained from CalledProcessError) on non-zero exit.
    """
    try:
        completed = subprocess.run(cmd, check=True, shell=True,
            stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
        bashStdout = completed.stdout
    except subprocess.CalledProcessError as e:
        # Chain the original exception so the shell exit code / output are
        # preserved in the traceback for debugging.
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) from e
    return bashStdout
def get_transcript_info(db, accession):
    """Fetch transcript structure for *accession* from ncbiRefSeq via hgsql.

    Returns a dict with chrom, strand, transcription/CDS boundaries (ints)
    and exonStarts/exonEnds as lists of ints. Raises ValueError when the
    accession is absent from the table.

    NOTE(review): *accession* is interpolated directly into the SQL; safe
    here because values come from the TRANSCRIPTS constant, but do not pass
    untrusted strings.
    """
    query = f"SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{accession}'"
    result = bash(f'hgsql {db} -Ne "{query}"')
    if not result.strip():
        raise ValueError(f"Transcript {accession} not found in {db}.ncbiRefSeq")
    (name, chrom, strand, tx_start, tx_end, cds_start, cds_end,
     exon_starts_raw, exon_ends_raw) = result.strip().split('\t')[:9]

    def coord_list(raw):
        # hgsql emits comma-separated coordinates with a trailing comma.
        return [int(item) for item in raw.rstrip(',').split(',')]

    return {
        'name': name,
        'chrom': chrom,
        'strand': strand,
        'txStart': int(tx_start),
        'txEnd': int(tx_end),
        'cdsStart': int(cds_start),
        'cdsEnd': int(cds_end),
        'exonStarts': coord_list(exon_starts_raw),
        'exonEnds': coord_list(exon_ends_raw),
    }
def build_cds_regions(tx):
    """Return the coding span of each exon as a list of (start, end) tuples.

    Exons that lie entirely outside the CDS are dropped; exons that overlap
    the CDS boundary are clipped to it. For minus-strand transcripts the
    list is reversed so the regions run in CDS (5'->3') order, matching the
    traversal done by cds_pos_to_genomic().
    """
    cds_lo, cds_hi = tx['cdsStart'], tx['cdsEnd']
    regions = [
        (max(exon_lo, cds_lo), min(exon_hi, cds_hi))
        for exon_lo, exon_hi in zip(tx['exonStarts'], tx['exonEnds'])
        # Keep only exons that overlap the CDS interval.
        if exon_hi > cds_lo and exon_lo < cds_hi
    ]
    # Minus strand: CDS begins at the highest genomic coordinate.
    return regions[::-1] if tx['strand'] == '-' else regions
def cds_pos_to_genomic(cds_pos, cds_regions, strand):
    """Map a 1-based CDS nucleotide position to a 0-based genomic interval.

    Walks *cds_regions* (already ordered 5'->3' by build_cds_regions),
    accumulating lengths until the region containing *cds_pos* is reached.
    Returns the half-open (start, end) interval for that single base, or
    (None, None) when cds_pos lies past the end of the CDS.
    """
    consumed = 0
    for region_lo, region_hi in cds_regions:
        span = region_hi - region_lo
        if consumed < cds_pos <= consumed + span:
            within = cds_pos - consumed - 1  # 0-based offset into this region
            if strand == '+':
                base = region_lo + within
            else:
                # Minus strand: the region's last genomic base is CDS-first.
                base = region_hi - 1 - within
            return base, base + 1
        consumed += span
    return None, None
def aa_to_genomic(aa_pos, cds_regions):
    """Map a 1-based amino-acid position to its codon's 0-based genomic span.

    Converts the first and third CDS nucleotides of the codon separately
    (they may sit in different exons) and returns the combined interval.
    Plus strand only, which covers MLH1/MSH2/MSH6. Returns (None, None)
    when either codon end falls outside the CDS.
    """
    codon_last = aa_pos * 3           # 1-based CDS position of third base
    codon_first = codon_last - 2      # 1-based CDS position of first base
    span_start, _ = cds_pos_to_genomic(codon_first, cds_regions, '+')
    _, span_end = cds_pos_to_genomic(codon_last, cds_regions, '+')
    if span_start is None or span_end is None:
        return None, None
    return span_start, span_end
def parse_hgvsc_pos(hgvsc):
    """Extract the 1-based CDS coordinate span from an HGVSc string.

    Handles the two forms present in the source papers:
      - simple substitution:  c.100G>C          -> (100, 100)
      - two-position delins:  c.104_105delinsAC -> (104, 105)
    Any other notation returns (None, None).
    """
    text = hgvsc.strip()
    substitution = re.match(r'c\.(\d+)[ACGT]>[ACGT]', text)
    if substitution:
        position = int(substitution.group(1))
        return position, position
    delins = re.match(r'c\.(\d+)_(\d+)delins', text)
    if delins:
        return int(delins.group(1)), int(delins.group(2))
    return None, None
def parse_protein_pos(prot):
    """Return the integer amino-acid position from protein notation.

    Accepts both 'p.W372R' and bare 'W372R' forms; the reference residue
    may be any capital letter or '*'. Returns None when the string does
    not match.
    """
    text = prot.strip()
    if text.startswith('p.'):
        text = text[2:]
    found = re.match(r'[A-Z*](\d+)', text)
    return int(found.group(1)) if found else None
# ============================================================================
# Classification functions
# ============================================================================
def classify_tavtigian(odds_path):
    """Classify a variant by Tavtigian OddsPath thresholds (Papers 1, 2, 4).

    Bands (value strictly greater than cutoff wins the band):
        > 18.7  PS3_Strong
        > 4.3   PS3_Moderate
        > 2.08  PS3_Supporting
        > 0.48  Indeterminate
        > 0.05  BS3_Supporting
        else    BS3_Strong
    Returns None when *odds_path* cannot be parsed as a float.
    """
    try:
        value = float(odds_path)
    except (ValueError, TypeError):
        return None
    # Descending cutoffs; first exceeded cutoff determines the band.
    bands = (
        (18.7, 'PS3_Strong'),
        (4.3, 'PS3_Moderate'),
        (2.08, 'PS3_Supporting'),
        (0.48, 'Indeterminate'),
        (0.05, 'BS3_Supporting'),
    )
    for cutoff, label in bands:
        if value > cutoff:
            return label
    return 'BS3_Strong'
def classify_jia_lof(lof_score):
    """Classify a variant by Jia LOF-score thresholds (Paper 3, DMS data).

    >= 0.4        -> 'PS3_Strong'
    0 <= x < 0.4  -> 'Indeterminate'
    < 0           -> 'BS3_Strong'
    Returns None when *lof_score* is not parseable as a float.
    """
    try:
        value = float(lof_score)
    except (ValueError, TypeError):
        return None
    if value < 0:
        return 'BS3_Strong'
    return 'PS3_Strong' if value >= 0.4 else 'Indeterminate'
# ============================================================================
# MouseOver builder
# ============================================================================
def build_mouseover(name, protein, classification, clinvar_id, score_val, score_label, paper_key):
    """Build the HTML mouseover text shown for a variant in the browser.

    paper_key must be a key of the module-level PAPERS table.

    Fix: the label/value parts were previously joined with an empty string,
    which ran all lines together ("...W372RProtein: ...Classification: ...").
    The field is declared as HTML mouseover text in the autoSql, so join
    with <br> to render one item per line in the hover popup.
    """
    paper_info = PAPERS[paper_key]
    parts = []
    parts.append(f"HGVSc/HGVSp: {name}")
    parts.append(f"Protein: {protein}")
    parts.append(f"Classification: {classification}")
    # Empty string and 'NA' both mean "no ClinVar record".
    if clinvar_id and clinvar_id != 'NA':
        parts.append(f'ClinVar ID: {clinvar_id}')
    else:
        parts.append("ClinVar ID: N/A")
    parts.append(f"{score_label}: {score_val}")
    parts.append(f'Paper: {paper_info["ref"]}')
    return "<br>".join(parts)
# ============================================================================
# BED entry builder
# ============================================================================
def make_bed_entry(chrom, start, end, name, color, gene, protein,
                   classification, clinvar_id, score_val, paper_ref, mouseover):
    """Assemble one tab-separated BED9+7 line matching the autoSql schema.

    score is fixed at 0 and strand at '.'; thickStart/thickEnd mirror
    chromStart/chromEnd, per the track's field definitions.
    """
    columns = [chrom, start, end, name, 0, '.', start, end, color,
               gene, protein, classification, clinvar_id,
               score_val, paper_ref, mouseover]
    return '\t'.join(str(column) for column in columns)
# ============================================================================
# Data parsing: Paper 1 - Drost et al. 2018
# ============================================================================
def parse_drost2018(filepath):
    """
    Drost et al. 2018 (PMID:30504929) - Supplementary Table S2.
    Source: NIHMS1010100-supplement-CIMRA.pdf
    Genes: MLH1 (NM_000249.3), MSH2 (NM_000251.2)
    Data hardcoded from published supplement for reliability.
    OddsPath values are the CIMRA-based "Odds for Pathogenicity" column
    (Classification Prior-P + CIMRA section). European decimal commas
    converted to periods.
    ClinVar ID exceptions:
    - V470G (c.1409T>G): Paper listed ClinVar 90658, but that ID now resolves
      to V470E (c.1409T>A) — a different nucleotide change at the same position.
      ClinVar appears to have reassigned the ID since publication. Removed to
      avoid linking to the wrong variant.
    Returns list of variant dicts.

    NOTE: filepath is accepted for signature parity with the other parse_*
    functions but is unused — the table data is hardcoded below.
    """
    # Tuple layout: (gene, dna, protein, clinvar_id, odds_path_cimra)
    DATA = [
        # MLH1 variants (39)
        ('MLH1', 'c.62C>A', 'p.A21E', '90297', '23.621'),
        ('MLH1', 'c.62C>T', 'p.A21V', '90298', '29.238'),
        ('MLH1', 'c.73A>T', 'p.I25F', '90343', '23.919'),
        ('MLH1', 'c.104_105delinsAC', 'p.M35N', '17105', '20.835'),
        ('MLH1', 'c.143A>C', 'p.Q48P', '89739', '31.923'),
        ('MLH1', 'c.146T>A', 'p.V49E', '89749', '7.829'),
        ('MLH1', 'c.191A>G', 'p.N64S', '89947', '0.483'),
        ('MLH1', 'c.203T>A', 'p.I68N', '90008', '29.607'),
        ('MLH1', 'c.244A>G', 'p.T82A', '90116', '47.103'),
        ('MLH1', 'c.245C>T', 'p.T82I', '90118', '33.147'),
        ('MLH1', 'c.301G>A', 'p.G101S', '90137', '44.238'),
        ('MLH1', 'c.332C>T', 'p.A111V', '90171', '21.908'),
        ('MLH1', 'c.346A>C', 'p.T116P', '', '17.699'),
        ('MLH1', 'c.382G>C', 'p.A128P', '90199', '32.734'),
        ('MLH1', 'c.394G>C', 'p.D132H', '17096', '0.002'),
        ('MLH1', 'c.779T>G', 'p.L260R', '90350', '18.378'),
        ('MLH1', 'c.793C>A', 'p.R265S', '90380', '10.580'),
        ('MLH1', 'c.803A>G', 'p.E268G', '90382', '0.070'),
        ('MLH1', 'c.911A>T', 'p.D304V', '90439', '33.990'),
        ('MLH1', 'c.918T>A', 'p.N306K', '90441', '51.428'),
        ('MLH1', 'c.1321G>A', 'p.A441T', '89696', '0.001'),
        ('MLH1', 'c.1664T>C', 'p.L555P', '89822', '57.577'),
        ('MLH1', 'c.1733A>G', 'p.E578G', '17090', '0.003'),
        ('MLH1', 'c.1742C>T', 'p.P581L', '41635', '0.002'),
        ('MLH1', 'c.1745T>C', 'p.L582P', '89871', '28.874'),
        ('MLH1', 'c.1799A>G', 'p.E600G', '89890', '0.002'),
        ('MLH1', 'c.1808C>G', 'p.P603R', '41636', '0.002'),
        ('MLH1', 'c.1823C>A', 'p.A608D', '89897', '2.905'),
        ('MLH1', 'c.1853A>C', 'p.K618T', '89906', '3.377'),
        ('MLH1', 'c.1855G>C', 'p.A619P', '89909', '2.833'),
        ('MLH1', 'c.1865T>A', 'p.L622H', '29657', '6.486'),
        ('MLH1', 'c.1943C>T', 'p.P648L', '89953', '0.521'),
        ('MLH1', 'c.1942C>T', 'p.P648S', '17097', '3.507'),
        ('MLH1', 'c.1976G>T', 'p.R659L', '89966', '4.507'),
        ('MLH1', 'c.2038T>C', 'p.C680R', '90006', '37.579'),
        ('MLH1', 'c.2041G>A', 'p.A681T', '17099', '0.031'),
        ('MLH1', 'c.2101C>A', 'p.Q701K', '90042', '0.034'),
        ('MLH1', 'c.2128A>G', 'p.N710D', '', '0.160'),
        ('MLH1', 'c.2263A>G', 'p.R755G', '219292', '68.636'),
        # MSH2 variants (35)
        ('MSH2', 'c.23C>T', 'p.T8M', '90964', '0.166'),
        ('MSH2', 'c.277C>T', 'p.L93F', '91044', '0.554'),
        ('MSH2', 'c.287G>A', 'p.R96H', '91053', '0.051'),
        ('MSH2', 'c.317G>A', 'p.R106K', '91062', '0.013'),
        ('MSH2', 'c.488T>A', 'p.V163D', '91107', '89.331'),
        ('MSH2', 'c.488T>G', 'p.V163G', '91108', '79.790'),
        ('MSH2', 'c.493T>G', 'p.Y165D', '91111', '81.818'),
        ('MSH2', 'c.505A>G', 'p.I169V', '91115', '0.000'),
        ('MSH2', 'c.560T>G', 'p.L187R', '91135', '59.040'),
        ('MSH2', 'c.595T>C', 'p.C199R', '91146', '105.160'),
        ('MSH2', 'c.599T>A', 'p.V200D', '91148', '89.331'),
        ('MSH2', 'c.929T>C', 'p.L310P', '91245', '55.450'),
        ('MSH2', 'c.989T>C', 'p.L330P', '91270', '81.818'),
        ('MSH2', 'c.991A>G', 'p.N331D', '91271', '0.125'),
        ('MSH2', 'c.1022T>C', 'p.L341P', '90507', '42.073'),
        ('MSH2', 'c.1046C>G', 'p.P349R', '90513', '80.798'),
        ('MSH2', 'c.1077A>T', 'p.R359S', '90539', '62.080'),
        ('MSH2', 'c.1168C>T', 'p.L390F', '41641', '0.003'),
        ('MSH2', 'c.1255C>A', 'p.Q419K', '90583', '0.002'),
        ('MSH2', 'c.1319T>C', 'p.L440P', '90625', '33.990'),
        ('MSH2', 'c.1409T>G', 'p.V470G', '', '83.898'),  # Paper listed ClinVar 90658, but that ID now points to V470E (c.1409T>A); removed
        ('MSH2', 'c.1571G>T', 'p.R524L', '90701', '0.179'),
        ('MSH2', 'c.1690A>G', 'p.T564A', '90750', '0.001'),
        ('MSH2', 'c.1730T>C', 'p.I577T', '41644', '0.323'),
        ('MSH2', 'c.1808A>G', 'p.D603G', '90791', '66.934'),
        ('MSH2', 'c.1955C>A', 'p.P652H', '90823', '0.035'),
        ('MSH2', 'c.1963G>A', 'p.V655I', '127637', '0.002'),
        ('MSH2', 'c.2021G>A', 'p.G674D', '90861', '50.787'),
        ('MSH2', 'c.2023A>G', 'p.K675E', '408453', '54.075'),
        ('MSH2', 'c.2047G>A', 'p.G683R', '90868', '74.003'),
        ('MSH2', 'c.2063T>G', 'p.M688R', '90874', '77.813'),
        ('MSH2', 'c.2087C>T', 'p.P696L', '90881', '18.845'),
        ('MSH2', 'c.2276G>A', 'p.G759E', '41866', '2.661'),
        ('MSH2', 'c.2287G>C', 'p.A763P', '', '35.294'),
        ('MSH2', 'c.2500G>A', 'p.A834T', '90986', '0.004'),
    ]
    print(f" Loading Drost 2018 data (hardcoded from Supplementary Table S2)...")
    variants = []
    # Wrap each row into the common variant-dict shape consumed by
    # process_variants(); coordinates are CDS-based for this paper.
    for gene, dna, protein, clinvar_id, odds_path in DATA:
        transcript = PAPER_TRANSCRIPTS['drost2018'][gene]
        variants.append({
            'gene': gene,
            'dna': dna,
            'protein': protein,
            'clinvar_id': clinvar_id,
            'odds_path': odds_path,
            'name': f"{transcript}:{dna}",
            'paper': 'drost2018',
            'coord_type': 'cds',
        })
    print(f" Loaded {len(variants)} variants from Drost 2018")
    return variants
# ============================================================================
# Data parsing: Paper 2 - Drost et al. 2020
# ============================================================================
def get_docx_tables(docx_path):
    """Extract every table from a DOCX file's word/document.xml.

    Returns a list of tables; each table is a list of rows; each row is a
    list of cell strings (all w:t text runs within the cell concatenated).
    Nested tables, if any, appear as separate entries because the search
    is recursive from the document root.
    """
    ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    with zipfile.ZipFile(docx_path, 'r') as archive:
        document = ET.fromstring(archive.read('word/document.xml'))
    extracted = []
    for tbl in document.findall('.//w:tbl', ns):
        rows = []
        for tr in tbl.findall('.//w:tr', ns):
            cells = []
            for tc in tr.findall('.//w:tc', ns):
                # A cell may contain several text runs; join them in order.
                runs = [t.text for t in tc.findall('.//w:t', ns) if t.text]
                cells.append(''.join(runs))
            rows.append(cells)
        extracted.append(rows)
    return extracted
def parse_drost2020(filepath):
    """
    Parse Drost 2020 supplementary DOCX (Tables S1, S3, S5).
    Gene: MSH6 (NM_000179.2 for S1/S3, protein-only for S5)
    Returns list of variant dicts (empty list if the file is missing).
    Table structure (from DOCX XML):
    Table 0 = S1 (31 calibration variants): DNA col 0, Protein col 1, ClinVar col 2, OddsPath col 10
    Table 2 = S3 (18 additional variants): DNA col 0, Protein col 1, ClinVar col 2, OddsPath col 8
    Table 4 = S5 (38 screen variants): Protein col 0, OddsPath col 4
    Errata corrections:
    - p.V509Ag (c.1526T>C) in S1: Trailing 'g' is a typo in the published
      DOCX. ClinVar 41588 confirms the correct notation is p.Val509Ala
      (p.V509A). Corrected during parsing.
    """
    # Missing supplement is tolerated: the run continues with the other papers.
    if not os.path.exists(filepath):
        print(f" WARNING: {filepath} not found, skipping Drost 2020")
        return []
    print(f" Parsing {filepath}...")
    tables = get_docx_tables(filepath)
    transcript = PAPER_TRANSCRIPTS['drost2020']['MSH6']
    variants = []
    # Table 0 = S1: 31 calibration variants (rows 2-32, skipping header rows 0-1)
    if len(tables) > 0:
        table_s1 = tables[0]
        for row in table_s1[2:]:  # Skip 2 header rows
            # Guard against short/merged rows lacking the OddsPath column.
            if len(row) < 11:
                continue
            dna = row[0].strip()
            protein = row[1].strip()
            clinvar_id = row[2].strip()
            odds_path = row[10].strip()
            # Rows whose first cell is not an HGVSc string are footnotes/headers.
            if not dna.startswith('c.'):
                continue
            if clinvar_id == 'NA':
                clinvar_id = ''
            # Fix known typos in published DOCX: trailing 'g' on protein names
            # is a formatting artifact (e.g., p.V509Ag, p.E220Dg, p.E1163Vg, p.R1304Kg)
            # ClinVar confirms correct forms without trailing 'g'
            protein = re.sub(r'^(p\.[A-Z]\d+[A-Z])g$', r'\1', protein)
            variants.append({
                'gene': 'MSH6',
                'dna': dna,
                'protein': protein,
                'clinvar_id': clinvar_id,
                'odds_path': odds_path,
                'name': f"{transcript}:{dna}",
                'paper': 'drost2020',
                'coord_type': 'cds',
            })
    print(f" S1: {len([v for v in variants if v['coord_type'] == 'cds'])} variants")
    # Table 2 = S3: 18 additional variants (rows 2-19, skipping header rows 0-1)
    s3_count = 0
    if len(tables) > 2:
        table_s3 = tables[2]
        for row in table_s3[2:]:  # Skip 2 header rows
            if len(row) < 9:
                continue
            dna = row[0].strip()
            protein = row[1].strip()
            clinvar_id = row[2].strip()
            odds_path = row[8].strip()
            if not dna.startswith('c.'):
                continue
            if clinvar_id == 'NA':
                clinvar_id = ''
            variants.append({
                'gene': 'MSH6',
                'dna': dna,
                'protein': protein,
                'clinvar_id': clinvar_id,
                'odds_path': odds_path,
                'name': f"{transcript}:{dna}",
                'paper': 'drost2020',
                'coord_type': 'cds',
            })
            s3_count += 1
    print(f" S3: {s3_count} variants")
    # Table 4 = S5: 38 genetic screen variants (rows 2-39, skipping header rows 0-1)
    # S5 reports protein changes only, so these map via codon coordinates.
    s5_count = 0
    if len(tables) > 4:
        table_s5 = tables[4]
        for row in table_s5[2:]:  # Skip 2 header rows
            if len(row) < 5:
                continue
            protein = row[0].strip()
            odds_path = row[4].strip()
            if not protein.startswith('p.'):
                continue
            variants.append({
                'gene': 'MSH6',
                'dna': '',
                'protein': protein,
                'clinvar_id': '',
                'odds_path': odds_path,
                'name': protein,
                'paper': 'drost2020',
                'coord_type': 'protein',
            })
            s5_count += 1
    print(f" S5: {s5_count} variants")
    print(f" Total: {len(variants)} variants from Drost 2020")
    return variants
# ============================================================================
# Data parsing: Paper 3 - Jia et al. 2021
# ============================================================================
def parse_jia2021(filepath):
    """Parse the Jia 2021 deep-mutational-scanning XLSX supplement.

    Reads sheet 'TableS5_AllVariants' (columns: Variant, Position,
    LOF score), skips rows without a LOF score, and returns MSH6-style
    variant dicts for MSH2 with protein ('aa_pos') coordinates.
    Returns [] when the file is missing so the overall run continues.
    """
    if not os.path.exists(filepath):
        print(f" WARNING: {filepath} not found, skipping Jia 2021")
        return []
    print(f" Parsing {filepath}...")
    workbook = openpyxl.load_workbook(filepath, read_only=True)
    sheet = workbook['TableS5_AllVariants']
    variants = []
    skipped_none = 0
    for row in sheet.iter_rows(min_row=2, values_only=True):
        cells = list(row)
        if len(cells) < 3:
            continue
        raw_name, raw_pos, lof = cells[0], cells[1], cells[2]
        if raw_name is None:
            continue
        if lof is None:
            # Variant was assayed but has no usable LOF score.
            skipped_none += 1
            continue
        short_name = str(raw_name).strip()
        hgvsp = f"p.{short_name}"
        # Prefer the explicit Position column; fall back to parsing the
        # compact variant notation (e.g. M1A).
        if raw_pos is not None:
            aa_pos = int(raw_pos)
        else:
            aa_pos = parse_protein_pos(short_name)
        if aa_pos is None:
            continue
        variants.append({
            'gene': 'MSH2',
            'dna': '',
            'protein': hgvsp,
            'clinvar_id': '',
            'lof_score': str(lof),
            'name': hgvsp,
            'paper': 'jia2021',
            'coord_type': 'protein',
            'aa_pos': aa_pos,
        })
    workbook.close()
    print(f" Parsed {len(variants)} variants from Jia 2021 (skipped {skipped_none} with no LOF score)")
    return variants
# ============================================================================
# Data parsing: Paper 4 - Rath et al. 2022
# ============================================================================
def parse_rath2022(filepath):
    """
    Rath et al. 2022 (PMID:36054288) - Supplementary Table S1.
    Source: NIHMS1834139-supplement-supinfo.pdf
    Gene: MLH1 (NM_000249.4 - MANE transcript)
    Data hardcoded from published supplement for reliability.
    OddsPath_Non-functional column. Variants with "-" (no OddsPath) are
    excluded. Scientific notation values (e.g. 1.08E-7) preserved as strings.
    "1,450" = 1450 (American thousands separator, not European decimal).
    ClinVar ID exceptions:
    - K618A (c.1852_1853delinsGC): Paper listed ClinVar 32128, which is a
      ClinVar measure_id (allele-level), not a variation_id (VCV). The
      /clinvar/variation/32128/ URL returns 404. Updated to VCV 17089, which
      is the current ClinVar variation record for this variant
      (NM_000249.4:c.1852_1853delinsGC, p.Lys618Ala).
    Returns list of variant dicts.

    NOTE: filepath is accepted for signature parity with the other parse_*
    functions but is unused — the table data is hardcoded below.
    """
    # Tuple layout: (protein_short, dna, clinvar_id, odds_path_non_functional)
    # Protein short is the cell line name = protein change without p. prefix
    DATA = [
        # Benign (9 with OddsPath, 2 skipped: I219V, H718Y have "-")
        ('D132H', 'c.394G>C', '17096', '1.08E-7'),
        ('V213M', 'c.637G>A', '41640', '5.76E-4'),
        ('E268G', 'c.803A>G', '90382', '2.24E-4'),
        ('S406N', 'c.1217G>A', '41634', '1.49E-10'),
        ('E578G', 'c.1733A>G', '17090', '1.44E-6'),
        ('K618A', 'c.1852_1853delinsGC', '17089', '2.90E-13'),  # Paper listed measure_id 32128 (404); updated to VCV 17089
        ('Q689R', 'c.2066A>G', '90016', '1.11E-4'),
        ('S692P', 'c.2074T>C', '127621', '4.02E-4'),
        ('V716M', 'c.2146G>A', '41639', '4.07E-5'),
        # VUS (6 with OddsPath, 15 skipped with "-")
        ('C39R', 'c.115T>C', '89655', '71.4'),
        ('D63N', 'c.187G>A', '89920', '0.00'),
        ('N64S', 'c.191A>G', '89947', '5.00E-2'),
        ('L73P', 'c.218T>C', '90086', '13.7'),
        ('G454R', 'c.1360G>C', '89705', '1.11E-2'),
        ('R474P', 'c.1421G>C', '', '0.489'),
        # Pathogenic (11 with OddsPath)
        ('I19F', 'c.55A>T', '90274', '346'),
        ('D41H', 'c.121G>C', '89682', '3.99E+4'),
        ('S44F', 'c.131C>T', '17079', '2.46E+4'),
        ('G67R', 'c.199G>A', '89992', '4.27E+4'),
        ('R100P', 'c.299G>C', '90133', '1450'),
        ('I107R', 'c.320T>G', '90167', '210'),
        ('T117M', 'c.350C>T', '17094', '1.33E+18'),
        ('S247P', 'c.739T>C', '90342', '105'),
        ('R265S', 'c.793C>A', '90380', '37.2'),
        ('I276R', 'c.827T>G', '520540', '319'),
        ('R687W', 'c.2059C>T', '90014', '1.74E+4'),
    ]
    print(f" Loading Rath 2022 data (hardcoded from Supplementary Table S1)...")
    transcript = PAPER_TRANSCRIPTS['rath2022']['MLH1']
    variants = []
    # Wrap each row into the common variant-dict shape; coordinates are
    # CDS-based for this paper.
    for protein_short, dna, clinvar_id, odds_path in DATA:
        protein = f"p.{protein_short}"
        variants.append({
            'gene': 'MLH1',
            'dna': dna,
            'protein': protein,
            'clinvar_id': clinvar_id,
            'odds_path': odds_path,
            'name': f"{transcript}:{dna}",
            'paper': 'rath2022',
            'coord_type': 'cds',
        })
    print(f" Loaded {len(variants)} variants from Rath 2022")
    return variants
# ============================================================================
# BED generation
# ============================================================================
def process_variants(variants, db, transcript_cache, stats):
    """
    Convert parsed variants to BED entries for a given genome assembly.

    variants: dicts from the parse_* functions; each carries 'gene',
        'paper', 'coord_type' ('cds' or 'protein') plus the fields those
        coordinate types need ('dna' HGVSc string, or 'protein'/'aa_pos').
    db: assembly database name (e.g. 'hg38') used for transcript lookup.
    transcript_cache: dict, mutated in place; memoizes transcript info and
        derived CDS regions per (db, transcript) so hgsql is hit once per gene.
    stats: dict of counters, mutated in place (parse/coord/classify failure
        counts plus per-paper inclusion counts).
    Returns list of BED line strings.
    """
    bed_entries = []
    for var in variants:
        gene = var['gene']
        paper = var['paper']
        paper_info = PAPERS[paper]
        # Get transcript info (cached). Two cache keys per transcript: the
        # raw transcript record and the derived CDS region list.
        tx_accession = TRANSCRIPTS[gene]
        cache_key = f"{db}_{tx_accession}"
        if cache_key not in transcript_cache:
            transcript_cache[cache_key] = get_transcript_info(db, tx_accession)
            transcript_cache[f"{cache_key}_cds"] = build_cds_regions(transcript_cache[cache_key])
        tx = transcript_cache[cache_key]
        cds_regions = transcript_cache[f"{cache_key}_cds"]
        chrom = tx['chrom']
        strand = tx['strand']
        # Determine genomic coordinates based on coordinate type.
        if var['coord_type'] == 'cds':
            cds_start, cds_end = parse_hgvsc_pos(var['dna'])
            if cds_start is None:
                stats['parse_failed'] += 1
                continue
            # Start and end are converted separately: a delins span may
            # cross an exon boundary.
            genomic_start, _ = cds_pos_to_genomic(cds_start, cds_regions, strand)
            _, genomic_end = cds_pos_to_genomic(cds_end, cds_regions, strand)
        elif var['coord_type'] == 'protein':
            aa_pos = var.get('aa_pos') or parse_protein_pos(var['protein'])
            if aa_pos is None:
                stats['parse_failed'] += 1
                continue
            genomic_start, genomic_end = aa_to_genomic(aa_pos, cds_regions)
        else:
            stats['parse_failed'] += 1
            continue
        if genomic_start is None:
            stats['coord_failed'] += 1
            continue
        # Classify variant: Jia 2021 uses LOF scores, the CIMRA papers use
        # Tavtigian OddsPath thresholds.
        if paper == 'jia2021':
            classification = classify_jia_lof(var['lof_score'])
            score_val = var['lof_score']
            score_label = 'LOF'
        else:
            classification = classify_tavtigian(var['odds_path'])
            score_val = var['odds_path']
            score_label = 'OddsPath'
        if classification is None:
            stats['classify_failed'] += 1
            continue
        color = COLORS[classification]
        name = var['name']
        protein = var['protein']
        clinvar_id = var.get('clinvar_id', '')
        paper_ref = paper_info['ref']
        # Build mouseover
        mouseover = build_mouseover(name, protein, classification,
                                    clinvar_id, score_val, score_label, paper)
        bed_line = make_bed_entry(chrom, genomic_start, genomic_end, name,
                                  color, gene, protein, classification,
                                  clinvar_id, score_val, paper_ref, mouseover)
        bed_entries.append(bed_line)
        stats['included'] += 1
        # Per-paper counter, created lazily on first inclusion.
        stats[f'included_{paper}'] = stats.get(f'included_{paper}', 0) + 1
    return bed_entries
def create_track(db):
    """Create BED and bigBed files for a given genome assembly.

    Parses all four papers, converts variants to BED9+7 lines via
    process_variants, writes and sorts the BED file, and runs bedToBigBed.
    Returns (bed_file, bb_file) paths; the bigBed step is best-effort — a
    failure is reported but does not abort the run, so the sorted BED file
    and the statistics are still produced.
    """
    print(f"\n{'='*70}")
    print(f"Processing {db}")
    print(f"{'='*70}")
    # Counters mutated in place by process_variants.
    stats = {
        'included': 0,
        'parse_failed': 0,
        'coord_failed': 0,
        'classify_failed': 0,
    }
    transcript_cache = {}
    # Parse all papers
    all_variants = []
    # Paper 1: Drost 2018 (data hardcoded; path kept for signature parity)
    print("\n Paper 1: Drost et al. 2018")
    drost2018_file = os.path.join(OUTPUT_DIR, "drost2018_supplement.pdf")
    all_variants.extend(parse_drost2018(drost2018_file))
    # Paper 2: Drost 2020
    print("\n Paper 2: Drost et al. 2020")
    drost2020_file = os.path.join(OUTPUT_DIR, "drost2020_supplement.docx")
    all_variants.extend(parse_drost2020(drost2020_file))
    # Paper 3: Jia 2021
    print("\n Paper 3: Jia et al. 2021")
    jia2021_file = os.path.join(OUTPUT_DIR, "mmc2.xlsx")
    all_variants.extend(parse_jia2021(jia2021_file))
    # Paper 4: Rath 2022 (data hardcoded; path kept for signature parity)
    print("\n Paper 4: Rath et al. 2022")
    rath2022_file = os.path.join(OUTPUT_DIR, "rath2022_supplement.pdf")
    all_variants.extend(parse_rath2022(rath2022_file))
    print(f"\n Total variants to process: {len(all_variants)}")
    # Convert to BED entries
    print(f"\n Querying transcript coordinates from {db}...")
    bed_entries = process_variants(all_variants, db, transcript_cache, stats)
    # Write BED file
    bed_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays_{db}.bed")
    print(f"\n Writing BED file: {bed_file}")
    with open(bed_file, 'w') as f:
        f.write('\n'.join(bed_entries) + '\n')
    # Sort BED file in place (bedToBigBed requires chrom/start ordering).
    print(" Sorting BED file...")
    bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}")
    # Create bigBed
    as_file = os.path.join(OUTPUT_DIR, "insightFunctionalAssays.as")
    db_label = db.replace('hg', 'Hg')
    bb_file = os.path.join(OUTPUT_DIR, f"insightFunctionalAssays{db_label}.bb")
    chrom_sizes = f"/cluster/data/{db}/chrom.sizes"
    print(f" Creating bigBed file: {bb_file}")
    try:
        bash(f"bedToBigBed -as={as_file} -type=bed9+7 -tab {bed_file} {chrom_sizes} {bb_file}")
        print(f" Successfully created: {bb_file}")
    except Exception as e:
        # Best-effort: report the failure but continue so the BED file and
        # the statistics below are still available for debugging.
        print(f" ERROR creating bigBed: {e}")
    # Print stats
    print(f"\n Statistics for {db}:")
    print(f" Total included: {stats['included']}")
    for paper_key in ['drost2018', 'drost2020', 'jia2021', 'rath2022']:
        count = stats.get(f'included_{paper_key}', 0)
        if count > 0:
            print(f" {PAPERS[paper_key]['ref']}: {count}")
    print(f" Parse failed: {stats['parse_failed']}")
    print(f" Coordinate failed: {stats['coord_failed']}")
    print(f" Classify failed: {stats['classify_failed']}")
    return bed_file, bb_file
# ============================================================================
# Main
# ============================================================================
def main():
    """Entry point: write the autoSql schema, then build tracks for hg38 and hg19.

    A failure on one assembly is logged with a traceback and does not stop
    the other assembly from being processed.
    """
    print("=" * 70)
    print("InSiGHT VCEP Functional Assays Track Generator")
    print("=" * 70)
    # Create output directory if needed
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Write AutoSql file (consumed by bedToBigBed in create_track)
    as_file = os.path.join(OUTPUT_DIR, "insightFunctionalAssays.as")
    print(f"\nWriting AutoSql file: {as_file}")
    with open(as_file, 'w') as f:
        f.write(AUTOSQL)
    # Process both assemblies independently; collect output paths per db.
    output_files = {}
    for db in ['hg38', 'hg19']:
        try:
            bed_file, bb_file = create_track(db)
            output_files[db] = {'bed': bed_file, 'bb': bb_file}
        except Exception as e:
            print(f"\nERROR processing {db}: {e}")
            import traceback
            traceback.print_exc()
    # Print summary
    print("\n" + "=" * 70)
    print("Output Files")
    print("=" * 70)
    print(f" AutoSql file: {as_file}")
    for db, files in output_files.items():
        print(f"\n {db}:")
        print(f" BED file: {files['bed']}")
        # bigBed creation is best-effort, so only report it if it exists.
        if os.path.exists(files['bb']):
            print(f" BigBed file: {files['bb']}")
    print("\n" + "=" * 70)
    print("Done!")
    print("=" * 70)
# Standard script entry-point guard.
if __name__ == "__main__":
    main()