55225be4fa1c5b6c1d886c524606b67701e86448
lrnassar
Tue Apr 14 10:32:57 2026 -0700
Fix typo, comment error, and bare except in InSiGHT build scripts per code review. refs #36582
diff --git src/hg/makeDb/scripts/insight/insightClinDomains.py src/hg/makeDb/scripts/insight/insightClinDomains.py
index 7ebc41ad6f0..3cd4301be7c 100644
--- src/hg/makeDb/scripts/insight/insightClinDomains.py
+++ src/hg/makeDb/scripts/insight/insightClinDomains.py
@@ -1,333 +1,333 @@
#!/usr/bin/env python3
"""
InSiGHT VCEP Clinically Relevant Protein Domains Track Generator
This script generates UCSC Genome Browser tracks (bigBed format) for the
InSiGHT Variant Curation Expert Panel (VCEP) clinically relevant protein domains
for Lynch syndrome genes: MLH1, MSH2, MSH6, and PMS2.
Coordinates are computed dynamically from amino acid positions by querying
NCBI RefSeq transcript coordinates from hgsql for both hg38 and hg19.
Transcripts used:
MLH1: NM_000249.4 (chr3, + strand)
MSH2: NM_000251.3 (chr2, + strand)
MSH6: NM_000179.3 (chr2, + strand)
PMS2: NM_000535.7 (chr7, - strand)
Author: Generated for InSiGHT VCEP
Date: 2025
"""
import subprocess
import os
def bash(cmd):
"""Run the cmd in bash subprocess"""
try:
rawBashOutput = subprocess.run(cmd, check=True, shell=True,
stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
- bashStdoutt = rawBashOutput.stdout
+ bashStdout = rawBashOutput.stdout
except subprocess.CalledProcessError as e:
raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
- return(bashStdoutt)
+ return(bashStdout)
# ============================================================================
# Configuration
# ============================================================================
OUTPUT_DIR = "/hive/users/lrnassar/insightHub/clinDomains"
# Transcripts to query
TRANSCRIPTS = {
'MLH1': 'NM_000249.4',
'MSH2': 'NM_000251.3',
'MSH6': 'NM_000179.3',
'PMS2': 'NM_000535.7',
}
# Domain data from proteinDomains.txt (gene, domain_name, aa_start, aa_end)
DOMAINS = [
('MLH1', 'ATPase', 20, 147),
('MLH1', 'MutSa interaction', 1, 329),
('MLH1', 'EXO1 interaction', 410, 650),
('MLH1', 'NLS', 469, 478),
('MLH1', 'MLH3/PMS1/PMS2 interaction', 506, 743),
('MLH1', 'NES', 585, 595),
('MLH1', 'NES', 650, 663),
('PMS2', 'ATPase', 1, 365),
('PMS2', 'NLS', 625, 632),
('PMS2', 'MLH1 interaction', 675, 855),
('MSH2', 'DNA binding', 1, 125),
('MSH2', 'Connector', 125, 300),
('MSH2', 'Lever', 300, 457),
('MSH2', 'Clamp', 457, 554),
('MSH2', 'Lever', 554, 620),
('MSH2', 'ATPase', 620, 856),
('MSH2', 'Helix-turn-helix', 856, 934),
('MSH2', 'MutLa interaction', 204, 225),
('MSH2', 'MSH3/MSH6 interaction', 378, 625),
('MSH2', 'MutLa interaction', 673, 686),
('MSH2', 'MSH3/MSH6 interaction', 875, 934),
('MSH2', 'EXO1 stabilisation & interaction', 266, 671),
('MSH6', 'PCNA interaction', 4, 11),
('MSH6', 'PWWP', 89, 194),
('MSH6', 'NLS', 246, 249),
('MSH6', 'NLS', 298, 313),
('MSH6', 'DNA binding', 360, 405),
('MSH6', 'Connector', 405, 701),
('MSH6', 'Lever', 701, 968),
('MSH6', 'Clamp', 968, 981),
('MSH6', 'Lever', 981, 1132),
('MSH6', 'ATPase', 1132, 1360),
('MSH6', 'MSH2 interaction', 326, 575),
('MSH6', 'MSH2 interaction', 1302, 1360),
]
# AutoSQL definition for the BED9+4 format
AUTOSQL = """table InSiGHTclinDomains
"InSiGHT VCEP clinically relevant protein domains for Lynch syndrome genes (MLH1, MSH2, MSH6, PMS2)"
(
string chrom; "Reference sequence chromosome or scaffold"
uint chromStart; "Start position in chromosome"
uint chromEnd; "End position in chromosome"
string name; "Domain name"
uint score; "Not used, all 0"
char[1] strand; "Not used, all ."
uint thickStart; "Same as chromStart"
uint thickEnd; "Same as chromEnd"
uint reserved; "RGB value (use R,G,B string in input file)"
string geneSymbol; "Gene symbol"
string NMaccession; "NCBI NM isoform accession"
string AAlocation; "Amino acid location of domain"
string _mouseOver; "Field only used as mouseOver"
)"""
# ============================================================================
# Functions for coordinate computation
# ============================================================================
def get_transcript_info(db, accession):
"""Query hgsql to get transcript information from ncbiRefSeq"""
query = f"SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{accession}'"
result = bash(f'hgsql {db} -Ne "{query}"')
if not result.strip():
raise ValueError(f"Transcript {accession} not found in {db}.ncbiRefSeq")
fields = result.strip().split('\t')
# Parse exon starts and ends (comma-separated, trailing comma)
exon_starts = [int(x) for x in fields[7].rstrip(',').split(',')]
exon_ends = [int(x) for x in fields[8].rstrip(',').split(',')]
return {
'name': fields[0],
'chrom': fields[1],
'strand': fields[2],
'txStart': int(fields[3]),
'txEnd': int(fields[4]),
'cdsStart': int(fields[5]),
'cdsEnd': int(fields[6]),
'exonStarts': exon_starts,
'exonEnds': exon_ends,
}
def build_cds_regions(tx):
"""Build list of CDS regions from transcript info"""
cds_regions = []
for i in range(len(tx['exonStarts'])):
ex_start = tx['exonStarts'][i]
ex_end = tx['exonEnds'][i]
# Skip exons outside CDS
if ex_end <= tx['cdsStart'] or ex_start >= tx['cdsEnd']:
continue
# Clip exon to CDS boundaries
region_start = max(ex_start, tx['cdsStart'])
region_end = min(ex_end, tx['cdsEnd'])
cds_regions.append((region_start, region_end, i+1))
# For minus strand, reverse order (CDS starts from high genomic coords)
if tx['strand'] == '-':
cds_regions = cds_regions[::-1]
return cds_regions
def aa_to_genomic_plus(aa_start, aa_end, cds_regions):
"""Convert AA range to genomic segments for + strand genes"""
nt_start = (aa_start - 1) * 3 + 1 # 1-based nucleotide position
nt_end = aa_end * 3
segments = []
cumulative = 0
for start, end, exon_num in cds_regions:
region_len = end - start
region_nt_start = cumulative + 1
region_nt_end = cumulative + region_len
if region_nt_end >= nt_start and region_nt_start <= nt_end:
overlap_nt_start = max(nt_start, region_nt_start)
overlap_nt_end = min(nt_end, region_nt_end)
genomic_start = start + (overlap_nt_start - region_nt_start)
genomic_end = start + (overlap_nt_end - region_nt_start) + 1
segments.append((genomic_start, genomic_end, exon_num))
cumulative += region_len
return segments
def aa_to_genomic_minus(aa_start, aa_end, cds_regions):
"""Convert AA range to genomic segments for - strand genes"""
nt_start = (aa_start - 1) * 3 + 1
nt_end = aa_end * 3
segments = []
cumulative = 0
for start, end, exon_num in cds_regions:
region_len = end - start
region_nt_start = cumulative + 1
region_nt_end = cumulative + region_len
if region_nt_end >= nt_start and region_nt_start <= nt_end:
overlap_nt_start = max(nt_start, region_nt_start)
overlap_nt_end = min(nt_end, region_nt_end)
offset_from_end_start = overlap_nt_start - region_nt_start
offset_from_end_end = overlap_nt_end - region_nt_start
genomic_end = end - offset_from_end_start
genomic_start = end - offset_from_end_end - 1
if genomic_start > genomic_end:
genomic_start, genomic_end = genomic_end, genomic_start
segments.append((genomic_start, genomic_end, exon_num))
cumulative += region_len
return segments
def generate_bed_entries(db, transcripts_info):
"""Generate BED entries for all domains"""
bed_lines = []
for gene, domain, aa_start, aa_end in DOMAINS:
tx = transcripts_info[gene]
chrom = tx['chrom']
strand = tx['strand']
accession = tx['name']
cds_regions = build_cds_regions(tx)
if strand == '+':
segments = aa_to_genomic_plus(aa_start, aa_end, cds_regions)
else:
segments = aa_to_genomic_minus(aa_start, aa_end, cds_regions)
for seg_start, seg_end, exon_num in segments:
if seg_start >= seg_end:
print(f" WARNING: Skipping invalid segment for {gene} {domain}: {seg_start}-{seg_end}")
continue
aa_loc = f"{aa_start}-{aa_end}"
mouse_over = f"Domain: {domain}Gene: {gene}Transcript: {accession}Amino acid loc: {aa_loc}"
bed_line = f"{chrom}\t{seg_start}\t{seg_end}\t{domain}\t0\t.\t{seg_start}\t{seg_end}\t230,3,131\t{gene}\t{accession}\t{aa_loc}\t{mouse_over}"
bed_lines.append(bed_line)
return bed_lines
def create_track(db, output_dir):
"""Create BED and bigBed files for a given genome assembly"""
print(f"\n{'='*70}")
print(f"Processing {db}")
print(f"{'='*70}")
# Query transcript info from hgsql
print(f"\nQuerying transcript coordinates from {db}.ncbiRefSeq...")
transcripts_info = {}
for gene, accession in TRANSCRIPTS.items():
print(f" {gene}: {accession}")
transcripts_info[gene] = get_transcript_info(db, accession)
# Generate BED entries
print("\nGenerating BED entries...")
bed_lines = generate_bed_entries(db, transcripts_info)
print(f" Generated {len(bed_lines)} domain segments")
# Write BED file
bed_file = os.path.join(output_dir, f"InSiGHTclinDomains_{db}.bed")
print(f"\nWriting BED file: {bed_file}")
with open(bed_file, 'w') as f:
f.write('\n'.join(bed_lines) + '\n')
# Sort BED file
print("Sorting BED file...")
bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}")
# Create bigBed
as_file = os.path.join(output_dir, "InSiGHTclinDomains.as")
bb_file = os.path.join(output_dir, f"InSiGHTclinDomains{db.capitalize()}.bb")
chrom_sizes = f"/cluster/data/{db}/chrom.sizes"
print(f"\nCreating bigBed file: {bb_file}")
try:
bash(f"bedToBigBed -as={as_file} -type=bed9+4 -tab {bed_file} {chrom_sizes} {bb_file}")
print(f" Successfully created: {bb_file}")
except Exception as e:
print(f" ERROR creating bigBed: {e}")
return bed_file, bb_file
# ============================================================================
# Main execution
# ============================================================================
if __name__ == "__main__":
print("=" * 70)
print("InSiGHT VCEP Clinically Relevant Protein Domains Track Generator")
print("=" * 70)
# Create output directory if needed
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Write AutoSql file
as_file = os.path.join(OUTPUT_DIR, "InSiGHTclinDomains.as")
print(f"\nWriting AutoSql file: {as_file}")
with open(as_file, 'w') as f:
f.write(AUTOSQL)
# Process both assemblies
output_files = {}
for db in ['hg38', 'hg19']:
try:
bed_file, bb_file = create_track(db, OUTPUT_DIR)
output_files[db] = {'bed': bed_file, 'bb': bb_file}
except Exception as e:
print(f"\nERROR processing {db}: {e}")
# Print summary
print("\n" + "=" * 70)
print("Output Files")
print("=" * 70)
print(f" AutoSql file: {as_file}")
for db, files in output_files.items():
print(f"\n {db}:")
print(f" BED file: {files['bed']}")
if os.path.exists(files['bb']):
print(f" BigBed file: {files['bb']}")
print("\n" + "=" * 70)
print("Custom Track Line (for bigBed)")
print("=" * 70)
print("""
track type=bigBed name="InSiGHT Domains" \\
description="InSiGHT VCEP Clinically Relevant Protein Domains" \\
visibility=pack itemRgb=on \\
bigDataUrl=
""")
print("=" * 70)
print("Done!")
print("=" * 70)