src/hg/makeDb/scripts/insight/insightClinDomains.py 55225be4fa1c5b6c1d886c524606b67701e86448

55225be4fa1c5b6c1d886c524606b67701e86448
lrnassar
  Tue Apr 14 10:32:57 2026 -0700
Fix typo, comment error, and bare except in InSiGHT build scripts per code review. refs #36582

diff --git src/hg/makeDb/scripts/insight/insightClinDomains.py src/hg/makeDb/scripts/insight/insightClinDomains.py
index 7ebc41ad6f0..3cd4301be7c 100644
--- src/hg/makeDb/scripts/insight/insightClinDomains.py
+++ src/hg/makeDb/scripts/insight/insightClinDomains.py
@@ -1,333 +1,333 @@
 #!/usr/bin/env python3
 """
 InSiGHT VCEP Clinically Relevant Protein Domains Track Generator
 
 This script generates UCSC Genome Browser tracks (bigBed format) for the
 InSiGHT Variant Curation Expert Panel (VCEP) clinically relevant protein domains
 for Lynch syndrome genes: MLH1, MSH2, MSH6, and PMS2.
 
 Coordinates are computed dynamically from amino acid positions by querying
 NCBI RefSeq transcript coordinates from hgsql for both hg38 and hg19.
 
 Transcripts used:
   MLH1: NM_000249.4  (chr3, + strand)
   MSH2: NM_000251.3  (chr2, + strand)
   MSH6: NM_000179.3  (chr2, + strand)
   PMS2: NM_000535.7  (chr7, - strand)
 
 Author: Generated for InSiGHT VCEP
 Date: 2025
 """
 
 import subprocess
 import os
 
 def bash(cmd):
     """Run the cmd in bash subprocess"""
     try:
         rawBashOutput = subprocess.run(cmd, check=True, shell=True,
                                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
-        bashStdoutt = rawBashOutput.stdout
+        bashStdout = rawBashOutput.stdout
     except subprocess.CalledProcessError as e:
         raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
-    return(bashStdoutt)
+    return(bashStdout)
 
 # ============================================================================
 # Configuration
 # ============================================================================
 OUTPUT_DIR = "/hive/users/lrnassar/insightHub/clinDomains"
 
 # Transcripts to query
 TRANSCRIPTS = {
     'MLH1': 'NM_000249.4',
     'MSH2': 'NM_000251.3',
     'MSH6': 'NM_000179.3',
     'PMS2': 'NM_000535.7',
 }
 
 # Domain data from proteinDomains.txt (gene, domain_name, aa_start, aa_end)
 DOMAINS = [
     ('MLH1', 'ATPase', 20, 147),
     ('MLH1', 'MutSa interaction', 1, 329),
     ('MLH1', 'EXO1 interaction', 410, 650),
     ('MLH1', 'NLS', 469, 478),
     ('MLH1', 'MLH3/PMS1/PMS2 interaction', 506, 743),
     ('MLH1', 'NES', 585, 595),
     ('MLH1', 'NES', 650, 663),
     ('PMS2', 'ATPase', 1, 365),
     ('PMS2', 'NLS', 625, 632),
     ('PMS2', 'MLH1 interaction', 675, 855),
     ('MSH2', 'DNA binding', 1, 125),
     ('MSH2', 'Connector', 125, 300),
     ('MSH2', 'Lever', 300, 457),
     ('MSH2', 'Clamp', 457, 554),
     ('MSH2', 'Lever', 554, 620),
     ('MSH2', 'ATPase', 620, 856),
     ('MSH2', 'Helix-turn-helix', 856, 934),
     ('MSH2', 'MutLa interaction', 204, 225),
     ('MSH2', 'MSH3/MSH6 interaction', 378, 625),
     ('MSH2', 'MutLa interaction', 673, 686),
     ('MSH2', 'MSH3/MSH6 interaction', 875, 934),
     ('MSH2', 'EXO1 stabilisation & interaction', 266, 671),
     ('MSH6', 'PCNA interaction', 4, 11),
     ('MSH6', 'PWWP', 89, 194),
     ('MSH6', 'NLS', 246, 249),
     ('MSH6', 'NLS', 298, 313),
     ('MSH6', 'DNA binding', 360, 405),
     ('MSH6', 'Connector', 405, 701),
     ('MSH6', 'Lever', 701, 968),
     ('MSH6', 'Clamp', 968, 981),
     ('MSH6', 'Lever', 981, 1132),
     ('MSH6', 'ATPase', 1132, 1360),
     ('MSH6', 'MSH2 interaction', 326, 575),
     ('MSH6', 'MSH2 interaction', 1302, 1360),
 ]
 
 # AutoSQL definition for the BED9+4 format
 AUTOSQL = """table InSiGHTclinDomains
 "InSiGHT VCEP clinically relevant protein domains for Lynch syndrome genes (MLH1, MSH2, MSH6, PMS2)"
    (
    string chrom;       "Reference sequence chromosome or scaffold"
    uint   chromStart;  "Start position in chromosome"
    uint   chromEnd;    "End position in chromosome"
    string name;        "Domain name"
    uint score;         "Not used, all 0"
    char[1] strand;     "Not used, all ."
    uint thickStart;    "Same as chromStart"
    uint thickEnd;      "Same as chromEnd"
    uint reserved;      "RGB value (use R,G,B string in input file)"
    string geneSymbol;  "Gene symbol"
    string NMaccession; "NCBI NM isoform accession"
    string AAlocation;  "Amino acid location of domain"
    string _mouseOver;  "Field only used as mouseOver"
    )"""
 
 # ============================================================================
 # Functions for coordinate computation
 # ============================================================================
 
 def get_transcript_info(db, accession):
     """Query hgsql to get transcript information from ncbiRefSeq"""
     query = f"SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{accession}'"
     result = bash(f'hgsql {db} -Ne "{query}"')
 
     if not result.strip():
         raise ValueError(f"Transcript {accession} not found in {db}.ncbiRefSeq")
 
     fields = result.strip().split('\t')
 
     # Parse exon starts and ends (comma-separated, trailing comma)
     exon_starts = [int(x) for x in fields[7].rstrip(',').split(',')]
     exon_ends = [int(x) for x in fields[8].rstrip(',').split(',')]
 
     return {
         'name': fields[0],
         'chrom': fields[1],
         'strand': fields[2],
         'txStart': int(fields[3]),
         'txEnd': int(fields[4]),
         'cdsStart': int(fields[5]),
         'cdsEnd': int(fields[6]),
         'exonStarts': exon_starts,
         'exonEnds': exon_ends,
     }
 
 def build_cds_regions(tx):
     """Build list of CDS regions from transcript info"""
     cds_regions = []
     for i in range(len(tx['exonStarts'])):
         ex_start = tx['exonStarts'][i]
         ex_end = tx['exonEnds'][i]
         # Skip exons outside CDS
         if ex_end <= tx['cdsStart'] or ex_start >= tx['cdsEnd']:
             continue
         # Clip exon to CDS boundaries
         region_start = max(ex_start, tx['cdsStart'])
         region_end = min(ex_end, tx['cdsEnd'])
         cds_regions.append((region_start, region_end, i+1))
 
     # For minus strand, reverse order (CDS starts from high genomic coords)
     if tx['strand'] == '-':
         cds_regions = cds_regions[::-1]
 
     return cds_regions
 
 def aa_to_genomic_plus(aa_start, aa_end, cds_regions):
     """Convert AA range to genomic segments for + strand genes"""
     nt_start = (aa_start - 1) * 3 + 1  # 1-based nucleotide position
     nt_end = aa_end * 3
 
     segments = []
     cumulative = 0
 
     for start, end, exon_num in cds_regions:
         region_len = end - start
         region_nt_start = cumulative + 1
         region_nt_end = cumulative + region_len
 
         if region_nt_end >= nt_start and region_nt_start <= nt_end:
             overlap_nt_start = max(nt_start, region_nt_start)
             overlap_nt_end = min(nt_end, region_nt_end)
 
             genomic_start = start + (overlap_nt_start - region_nt_start)
             genomic_end = start + (overlap_nt_end - region_nt_start) + 1
 
             segments.append((genomic_start, genomic_end, exon_num))
 
         cumulative += region_len
 
     return segments
 
 def aa_to_genomic_minus(aa_start, aa_end, cds_regions):
     """Convert AA range to genomic segments for - strand genes"""
     nt_start = (aa_start - 1) * 3 + 1
     nt_end = aa_end * 3
 
     segments = []
     cumulative = 0
 
     for start, end, exon_num in cds_regions:
         region_len = end - start
         region_nt_start = cumulative + 1
         region_nt_end = cumulative + region_len
 
         if region_nt_end >= nt_start and region_nt_start <= nt_end:
             overlap_nt_start = max(nt_start, region_nt_start)
             overlap_nt_end = min(nt_end, region_nt_end)
 
             offset_from_end_start = overlap_nt_start - region_nt_start
             offset_from_end_end = overlap_nt_end - region_nt_start
 
             genomic_end = end - offset_from_end_start
             genomic_start = end - offset_from_end_end - 1
 
             if genomic_start > genomic_end:
                 genomic_start, genomic_end = genomic_end, genomic_start
 
             segments.append((genomic_start, genomic_end, exon_num))
 
         cumulative += region_len
 
     return segments
 
 def generate_bed_entries(db, transcripts_info):
     """Generate BED entries for all domains"""
     bed_lines = []
 
     for gene, domain, aa_start, aa_end in DOMAINS:
         tx = transcripts_info[gene]
         chrom = tx['chrom']
         strand = tx['strand']
         accession = tx['name']
 
         cds_regions = build_cds_regions(tx)
 
         if strand == '+':
             segments = aa_to_genomic_plus(aa_start, aa_end, cds_regions)
         else:
             segments = aa_to_genomic_minus(aa_start, aa_end, cds_regions)
 
         for seg_start, seg_end, exon_num in segments:
             if seg_start >= seg_end:
                 print(f"  WARNING: Skipping invalid segment for {gene} {domain}: {seg_start}-{seg_end}")
                 continue
 
             aa_loc = f"{aa_start}-{aa_end}"
             mouse_over = f"<b>Domain: </b>{domain}</br><b>Gene: </b>{gene}</br><b>Transcript: </b>{accession}</br><b>Amino acid loc:</b> {aa_loc}"
             bed_line = f"{chrom}\t{seg_start}\t{seg_end}\t{domain}\t0\t.\t{seg_start}\t{seg_end}\t230,3,131\t{gene}\t{accession}\t{aa_loc}\t{mouse_over}"
             bed_lines.append(bed_line)
 
     return bed_lines
 
 def create_track(db, output_dir):
     """Create BED and bigBed files for a given genome assembly"""
     print(f"\n{'='*70}")
     print(f"Processing {db}")
     print(f"{'='*70}")
 
     # Query transcript info from hgsql
     print(f"\nQuerying transcript coordinates from {db}.ncbiRefSeq...")
     transcripts_info = {}
     for gene, accession in TRANSCRIPTS.items():
         print(f"  {gene}: {accession}")
         transcripts_info[gene] = get_transcript_info(db, accession)
 
     # Generate BED entries
     print("\nGenerating BED entries...")
     bed_lines = generate_bed_entries(db, transcripts_info)
     print(f"  Generated {len(bed_lines)} domain segments")
 
     # Write BED file
     bed_file = os.path.join(output_dir, f"InSiGHTclinDomains_{db}.bed")
     print(f"\nWriting BED file: {bed_file}")
     with open(bed_file, 'w') as f:
         f.write('\n'.join(bed_lines) + '\n')
 
     # Sort BED file
     print("Sorting BED file...")
     bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}")
 
     # Create bigBed
     as_file = os.path.join(output_dir, "InSiGHTclinDomains.as")
     bb_file = os.path.join(output_dir, f"InSiGHTclinDomains{db.capitalize()}.bb")
     chrom_sizes = f"/cluster/data/{db}/chrom.sizes"
 
     print(f"\nCreating bigBed file: {bb_file}")
     try:
         bash(f"bedToBigBed -as={as_file} -type=bed9+4 -tab {bed_file} {chrom_sizes} {bb_file}")
         print(f"  Successfully created: {bb_file}")
     except Exception as e:
         print(f"  ERROR creating bigBed: {e}")
 
     return bed_file, bb_file
 
 # ============================================================================
 # Main execution
 # ============================================================================
 if __name__ == "__main__":
     print("=" * 70)
     print("InSiGHT VCEP Clinically Relevant Protein Domains Track Generator")
     print("=" * 70)
 
     # Create output directory if needed
     os.makedirs(OUTPUT_DIR, exist_ok=True)
 
     # Write AutoSql file
     as_file = os.path.join(OUTPUT_DIR, "InSiGHTclinDomains.as")
     print(f"\nWriting AutoSql file: {as_file}")
     with open(as_file, 'w') as f:
         f.write(AUTOSQL)
 
     # Process both assemblies
     output_files = {}
     for db in ['hg38', 'hg19']:
         try:
             bed_file, bb_file = create_track(db, OUTPUT_DIR)
             output_files[db] = {'bed': bed_file, 'bb': bb_file}
         except Exception as e:
             print(f"\nERROR processing {db}: {e}")
 
     # Print summary
     print("\n" + "=" * 70)
     print("Output Files")
     print("=" * 70)
     print(f"  AutoSql file: {as_file}")
     for db, files in output_files.items():
         print(f"\n  {db}:")
         print(f"    BED file:    {files['bed']}")
         if os.path.exists(files['bb']):
             print(f"    BigBed file: {files['bb']}")
 
     print("\n" + "=" * 70)
     print("Custom Track Line (for bigBed)")
     print("=" * 70)
     print("""
   track type=bigBed name="InSiGHT Domains" \\
     description="InSiGHT VCEP Clinically Relevant Protein Domains" \\
     visibility=pack itemRgb=on \\
     bigDataUrl=<URL_TO_YOUR_BB_FILE>
 """)
 
     print("=" * 70)
     print("Done!")
     print("=" * 70)