ed3cb243b9151cae7d1859712148fa1d1e1a2196
lrnassar
Mon Apr 13 17:34:17 2026 -0700
Adding makedoc and build scripts for InSiGHT VCEP track hub. refs #36582
diff --git src/hg/makeDb/scripts/insight/insightClinDomains.py src/hg/makeDb/scripts/insight/insightClinDomains.py
new file mode 100644
index 00000000000..7ebc41ad6f0
--- /dev/null
+++ src/hg/makeDb/scripts/insight/insightClinDomains.py
@@ -0,0 +1,333 @@
+#!/usr/bin/env python3
+"""
+InSiGHT VCEP Clinically Relevant Protein Domains Track Generator
+
+This script generates UCSC Genome Browser tracks (bigBed format) for the
+InSiGHT Variant Curation Expert Panel (VCEP) clinically relevant protein domains
+for Lynch syndrome genes: MLH1, MSH2, MSH6, and PMS2.
+
+Coordinates are computed dynamically from amino acid positions by querying
+NCBI RefSeq transcript coordinates from hgsql for both hg38 and hg19.
+
+Transcripts used:
+ MLH1: NM_000249.4 (chr3, + strand)
+ MSH2: NM_000251.3 (chr2, + strand)
+ MSH6: NM_000179.3 (chr2, + strand)
+ PMS2: NM_000535.7 (chr7, - strand)
+
+Author: Generated for InSiGHT VCEP
+Date: 2025
+"""
+
+import subprocess
+import os
+
+def bash(cmd):
+ """Run the cmd in bash subprocess"""
+ try:
+ rawBashOutput = subprocess.run(cmd, check=True, shell=True,
+ stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+ bashStdoutt = rawBashOutput.stdout
+ except subprocess.CalledProcessError as e:
+ raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+ return(bashStdoutt)
+
+# ============================================================================
+# Configuration
+# ============================================================================
+OUTPUT_DIR = "/hive/users/lrnassar/insightHub/clinDomains"
+
+# Transcripts to query
+TRANSCRIPTS = {
+ 'MLH1': 'NM_000249.4',
+ 'MSH2': 'NM_000251.3',
+ 'MSH6': 'NM_000179.3',
+ 'PMS2': 'NM_000535.7',
+}
+
+# Domain data from proteinDomains.txt (gene, domain_name, aa_start, aa_end)
+DOMAINS = [
+ ('MLH1', 'ATPase', 20, 147),
+ ('MLH1', 'MutSa interaction', 1, 329),
+ ('MLH1', 'EXO1 interaction', 410, 650),
+ ('MLH1', 'NLS', 469, 478),
+ ('MLH1', 'MLH3/PMS1/PMS2 interaction', 506, 743),
+ ('MLH1', 'NES', 585, 595),
+ ('MLH1', 'NES', 650, 663),
+ ('PMS2', 'ATPase', 1, 365),
+ ('PMS2', 'NLS', 625, 632),
+ ('PMS2', 'MLH1 interaction', 675, 855),
+ ('MSH2', 'DNA binding', 1, 125),
+ ('MSH2', 'Connector', 125, 300),
+ ('MSH2', 'Lever', 300, 457),
+ ('MSH2', 'Clamp', 457, 554),
+ ('MSH2', 'Lever', 554, 620),
+ ('MSH2', 'ATPase', 620, 856),
+ ('MSH2', 'Helix-turn-helix', 856, 934),
+ ('MSH2', 'MutLa interaction', 204, 225),
+ ('MSH2', 'MSH3/MSH6 interaction', 378, 625),
+ ('MSH2', 'MutLa interaction', 673, 686),
+ ('MSH2', 'MSH3/MSH6 interaction', 875, 934),
+ ('MSH2', 'EXO1 stabilisation & interaction', 266, 671),
+ ('MSH6', 'PCNA interaction', 4, 11),
+ ('MSH6', 'PWWP', 89, 194),
+ ('MSH6', 'NLS', 246, 249),
+ ('MSH6', 'NLS', 298, 313),
+ ('MSH6', 'DNA binding', 360, 405),
+ ('MSH6', 'Connector', 405, 701),
+ ('MSH6', 'Lever', 701, 968),
+ ('MSH6', 'Clamp', 968, 981),
+ ('MSH6', 'Lever', 981, 1132),
+ ('MSH6', 'ATPase', 1132, 1360),
+ ('MSH6', 'MSH2 interaction', 326, 575),
+ ('MSH6', 'MSH2 interaction', 1302, 1360),
+]
+
+# AutoSQL definition for the BED9+4 format
+AUTOSQL = """table InSiGHTclinDomains
+"InSiGHT VCEP clinically relevant protein domains for Lynch syndrome genes (MLH1, MSH2, MSH6, PMS2)"
+ (
+ string chrom; "Reference sequence chromosome or scaffold"
+ uint chromStart; "Start position in chromosome"
+ uint chromEnd; "End position in chromosome"
+ string name; "Domain name"
+ uint score; "Not used, all 0"
+ char[1] strand; "Not used, all ."
+ uint thickStart; "Same as chromStart"
+ uint thickEnd; "Same as chromEnd"
+ uint reserved; "RGB value (use R,G,B string in input file)"
+ string geneSymbol; "Gene symbol"
+ string NMaccession; "NCBI NM isoform accession"
+ string AAlocation; "Amino acid location of domain"
+ string _mouseOver; "Field only used as mouseOver"
+ )"""
+
+# ============================================================================
+# Functions for coordinate computation
+# ============================================================================
+
+def get_transcript_info(db, accession):
+ """Query hgsql to get transcript information from ncbiRefSeq"""
+ query = f"SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{accession}'"
+ result = bash(f'hgsql {db} -Ne "{query}"')
+
+ if not result.strip():
+ raise ValueError(f"Transcript {accession} not found in {db}.ncbiRefSeq")
+
+ fields = result.strip().split('\t')
+
+ # Parse exon starts and ends (comma-separated, trailing comma)
+ exon_starts = [int(x) for x in fields[7].rstrip(',').split(',')]
+ exon_ends = [int(x) for x in fields[8].rstrip(',').split(',')]
+
+ return {
+ 'name': fields[0],
+ 'chrom': fields[1],
+ 'strand': fields[2],
+ 'txStart': int(fields[3]),
+ 'txEnd': int(fields[4]),
+ 'cdsStart': int(fields[5]),
+ 'cdsEnd': int(fields[6]),
+ 'exonStarts': exon_starts,
+ 'exonEnds': exon_ends,
+ }
+
+def build_cds_regions(tx):
+ """Build list of CDS regions from transcript info"""
+ cds_regions = []
+ for i in range(len(tx['exonStarts'])):
+ ex_start = tx['exonStarts'][i]
+ ex_end = tx['exonEnds'][i]
+ # Skip exons outside CDS
+ if ex_end <= tx['cdsStart'] or ex_start >= tx['cdsEnd']:
+ continue
+ # Clip exon to CDS boundaries
+ region_start = max(ex_start, tx['cdsStart'])
+ region_end = min(ex_end, tx['cdsEnd'])
+ cds_regions.append((region_start, region_end, i+1))
+
+ # For minus strand, reverse order (CDS starts from high genomic coords)
+ if tx['strand'] == '-':
+ cds_regions = cds_regions[::-1]
+
+ return cds_regions
+
+def aa_to_genomic_plus(aa_start, aa_end, cds_regions):
+ """Convert AA range to genomic segments for + strand genes"""
+ nt_start = (aa_start - 1) * 3 + 1 # 1-based nucleotide position
+ nt_end = aa_end * 3
+
+ segments = []
+ cumulative = 0
+
+ for start, end, exon_num in cds_regions:
+ region_len = end - start
+ region_nt_start = cumulative + 1
+ region_nt_end = cumulative + region_len
+
+ if region_nt_end >= nt_start and region_nt_start <= nt_end:
+ overlap_nt_start = max(nt_start, region_nt_start)
+ overlap_nt_end = min(nt_end, region_nt_end)
+
+ genomic_start = start + (overlap_nt_start - region_nt_start)
+ genomic_end = start + (overlap_nt_end - region_nt_start) + 1
+
+ segments.append((genomic_start, genomic_end, exon_num))
+
+ cumulative += region_len
+
+ return segments
+
+def aa_to_genomic_minus(aa_start, aa_end, cds_regions):
+ """Convert AA range to genomic segments for - strand genes"""
+ nt_start = (aa_start - 1) * 3 + 1
+ nt_end = aa_end * 3
+
+ segments = []
+ cumulative = 0
+
+ for start, end, exon_num in cds_regions:
+ region_len = end - start
+ region_nt_start = cumulative + 1
+ region_nt_end = cumulative + region_len
+
+ if region_nt_end >= nt_start and region_nt_start <= nt_end:
+ overlap_nt_start = max(nt_start, region_nt_start)
+ overlap_nt_end = min(nt_end, region_nt_end)
+
+ offset_from_end_start = overlap_nt_start - region_nt_start
+ offset_from_end_end = overlap_nt_end - region_nt_start
+
+ genomic_end = end - offset_from_end_start
+ genomic_start = end - offset_from_end_end - 1
+
+ if genomic_start > genomic_end:
+ genomic_start, genomic_end = genomic_end, genomic_start
+
+ segments.append((genomic_start, genomic_end, exon_num))
+
+ cumulative += region_len
+
+ return segments
+
+def generate_bed_entries(db, transcripts_info):
+ """Generate BED entries for all domains"""
+ bed_lines = []
+
+ for gene, domain, aa_start, aa_end in DOMAINS:
+ tx = transcripts_info[gene]
+ chrom = tx['chrom']
+ strand = tx['strand']
+ accession = tx['name']
+
+ cds_regions = build_cds_regions(tx)
+
+ if strand == '+':
+ segments = aa_to_genomic_plus(aa_start, aa_end, cds_regions)
+ else:
+ segments = aa_to_genomic_minus(aa_start, aa_end, cds_regions)
+
+ for seg_start, seg_end, exon_num in segments:
+ if seg_start >= seg_end:
+ print(f" WARNING: Skipping invalid segment for {gene} {domain}: {seg_start}-{seg_end}")
+ continue
+
+ aa_loc = f"{aa_start}-{aa_end}"
+ mouse_over = f"Domain: {domain}Gene: {gene}Transcript: {accession}Amino acid loc: {aa_loc}"
+ bed_line = f"{chrom}\t{seg_start}\t{seg_end}\t{domain}\t0\t.\t{seg_start}\t{seg_end}\t230,3,131\t{gene}\t{accession}\t{aa_loc}\t{mouse_over}"
+ bed_lines.append(bed_line)
+
+ return bed_lines
+
+def create_track(db, output_dir):
+ """Create BED and bigBed files for a given genome assembly"""
+ print(f"\n{'='*70}")
+ print(f"Processing {db}")
+ print(f"{'='*70}")
+
+ # Query transcript info from hgsql
+ print(f"\nQuerying transcript coordinates from {db}.ncbiRefSeq...")
+ transcripts_info = {}
+ for gene, accession in TRANSCRIPTS.items():
+ print(f" {gene}: {accession}")
+ transcripts_info[gene] = get_transcript_info(db, accession)
+
+ # Generate BED entries
+ print("\nGenerating BED entries...")
+ bed_lines = generate_bed_entries(db, transcripts_info)
+ print(f" Generated {len(bed_lines)} domain segments")
+
+ # Write BED file
+ bed_file = os.path.join(output_dir, f"InSiGHTclinDomains_{db}.bed")
+ print(f"\nWriting BED file: {bed_file}")
+ with open(bed_file, 'w') as f:
+ f.write('\n'.join(bed_lines) + '\n')
+
+ # Sort BED file
+ print("Sorting BED file...")
+ bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}")
+
+ # Create bigBed
+ as_file = os.path.join(output_dir, "InSiGHTclinDomains.as")
+ bb_file = os.path.join(output_dir, f"InSiGHTclinDomains{db.capitalize()}.bb")
+ chrom_sizes = f"/cluster/data/{db}/chrom.sizes"
+
+ print(f"\nCreating bigBed file: {bb_file}")
+ try:
+ bash(f"bedToBigBed -as={as_file} -type=bed9+4 -tab {bed_file} {chrom_sizes} {bb_file}")
+ print(f" Successfully created: {bb_file}")
+ except Exception as e:
+ print(f" ERROR creating bigBed: {e}")
+
+ return bed_file, bb_file
+
+# ============================================================================
+# Main execution
+# ============================================================================
+if __name__ == "__main__":
+ print("=" * 70)
+ print("InSiGHT VCEP Clinically Relevant Protein Domains Track Generator")
+ print("=" * 70)
+
+ # Create output directory if needed
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ # Write AutoSql file
+ as_file = os.path.join(OUTPUT_DIR, "InSiGHTclinDomains.as")
+ print(f"\nWriting AutoSql file: {as_file}")
+ with open(as_file, 'w') as f:
+ f.write(AUTOSQL)
+
+ # Process both assemblies
+ output_files = {}
+ for db in ['hg38', 'hg19']:
+ try:
+ bed_file, bb_file = create_track(db, OUTPUT_DIR)
+ output_files[db] = {'bed': bed_file, 'bb': bb_file}
+ except Exception as e:
+ print(f"\nERROR processing {db}: {e}")
+
+ # Print summary
+ print("\n" + "=" * 70)
+ print("Output Files")
+ print("=" * 70)
+ print(f" AutoSql file: {as_file}")
+ for db, files in output_files.items():
+ print(f"\n {db}:")
+ print(f" BED file: {files['bed']}")
+ if os.path.exists(files['bb']):
+ print(f" BigBed file: {files['bb']}")
+
+ print("\n" + "=" * 70)
+ print("Custom Track Line (for bigBed)")
+ print("=" * 70)
+ print("""
+ track type=bigBed name="InSiGHT Domains" \\
+ description="InSiGHT VCEP Clinically Relevant Protein Domains" \\
+ visibility=pack itemRgb=on \\
+ bigDataUrl=
+""")
+
+ print("=" * 70)
+ print("Done!")
+ print("=" * 70)