55225be4fa1c5b6c1d886c524606b67701e86448 lrnassar Tue Apr 14 10:32:57 2026 -0700 Fix typo, comment error, and bare except in InSiGHT build scripts per code review. refs #36582 diff --git src/hg/makeDb/scripts/insight/insightClinDomains.py src/hg/makeDb/scripts/insight/insightClinDomains.py index 7ebc41ad6f0..3cd4301be7c 100644 --- src/hg/makeDb/scripts/insight/insightClinDomains.py +++ src/hg/makeDb/scripts/insight/insightClinDomains.py @@ -1,333 +1,333 @@ #!/usr/bin/env python3 """ InSiGHT VCEP Clinically Relevant Protein Domains Track Generator This script generates UCSC Genome Browser tracks (bigBed format) for the InSiGHT Variant Curation Expert Panel (VCEP) clinically relevant protein domains for Lynch syndrome genes: MLH1, MSH2, MSH6, and PMS2. Coordinates are computed dynamically from amino acid positions by querying NCBI RefSeq transcript coordinates from hgsql for both hg38 and hg19. Transcripts used: MLH1: NM_000249.4 (chr3, + strand) MSH2: NM_000251.3 (chr2, + strand) MSH6: NM_000179.3 (chr2, + strand) PMS2: NM_000535.7 (chr7, - strand) Author: Generated for InSiGHT VCEP Date: 2025 """ import subprocess import os def bash(cmd): """Run the cmd in bash subprocess""" try: rawBashOutput = subprocess.run(cmd, check=True, shell=True, stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) - bashStdoutt = rawBashOutput.stdout + bashStdout = rawBashOutput.stdout except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) - return(bashStdoutt) + return(bashStdout) # ============================================================================ # Configuration # ============================================================================ OUTPUT_DIR = "/hive/users/lrnassar/insightHub/clinDomains" # Transcripts to query TRANSCRIPTS = { 'MLH1': 'NM_000249.4', 'MSH2': 'NM_000251.3', 'MSH6': 'NM_000179.3', 'PMS2': 'NM_000535.7', } # Domain data from proteinDomains.txt (gene, domain_name, aa_start, aa_end) DOMAINS = [ ('MLH1', 'ATPase', 20, 147), ('MLH1', 'MutSa interaction', 1, 329), ('MLH1', 'EXO1 interaction', 410, 650), ('MLH1', 'NLS', 469, 478), ('MLH1', 'MLH3/PMS1/PMS2 interaction', 506, 743), ('MLH1', 'NES', 585, 595), ('MLH1', 'NES', 650, 663), ('PMS2', 'ATPase', 1, 365), ('PMS2', 'NLS', 625, 632), ('PMS2', 'MLH1 interaction', 675, 855), ('MSH2', 'DNA binding', 1, 125), ('MSH2', 'Connector', 125, 300), ('MSH2', 'Lever', 300, 457), ('MSH2', 'Clamp', 457, 554), ('MSH2', 'Lever', 554, 620), ('MSH2', 'ATPase', 620, 856), ('MSH2', 'Helix-turn-helix', 856, 934), ('MSH2', 'MutLa interaction', 204, 225), ('MSH2', 'MSH3/MSH6 interaction', 378, 625), ('MSH2', 'MutLa interaction', 673, 686), ('MSH2', 'MSH3/MSH6 interaction', 875, 934), ('MSH2', 'EXO1 stabilisation & interaction', 266, 671), ('MSH6', 'PCNA interaction', 4, 11), ('MSH6', 'PWWP', 89, 194), ('MSH6', 'NLS', 246, 249), ('MSH6', 'NLS', 298, 313), ('MSH6', 'DNA binding', 360, 405), ('MSH6', 'Connector', 405, 701), ('MSH6', 'Lever', 701, 968), ('MSH6', 'Clamp', 968, 981), ('MSH6', 'Lever', 981, 1132), ('MSH6', 'ATPase', 1132, 1360), ('MSH6', 'MSH2 interaction', 326, 575), ('MSH6', 'MSH2 interaction', 1302, 1360), ] # AutoSQL definition for the BED9+4 format AUTOSQL = """table InSiGHTclinDomains "InSiGHT VCEP clinically relevant protein domains for Lynch syndrome genes (MLH1, MSH2, MSH6, PMS2)" ( string chrom; "Reference sequence chromosome or scaffold" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Domain name" uint score; "Not used, all 0" char[1] strand; "Not used, all ." uint thickStart; "Same as chromStart" uint thickEnd; "Same as chromEnd" uint reserved; "RGB value (use R,G,B string in input file)" string geneSymbol; "Gene symbol" string NMaccession; "NCBI NM isoform accession" string AAlocation; "Amino acid location of domain" string _mouseOver; "Field only used as mouseOver" )""" # ============================================================================ # Functions for coordinate computation # ============================================================================ def get_transcript_info(db, accession): """Query hgsql to get transcript information from ncbiRefSeq""" query = f"SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{accession}'" result = bash(f'hgsql {db} -Ne "{query}"') if not result.strip(): raise ValueError(f"Transcript {accession} not found in {db}.ncbiRefSeq") fields = result.strip().split('\t') # Parse exon starts and ends (comma-separated, trailing comma) exon_starts = [int(x) for x in fields[7].rstrip(',').split(',')] exon_ends = [int(x) for x in fields[8].rstrip(',').split(',')] return { 'name': fields[0], 'chrom': fields[1], 'strand': fields[2], 'txStart': int(fields[3]), 'txEnd': int(fields[4]), 'cdsStart': int(fields[5]), 'cdsEnd': int(fields[6]), 'exonStarts': exon_starts, 'exonEnds': exon_ends, } def build_cds_regions(tx): """Build list of CDS regions from transcript info""" cds_regions = [] for i in range(len(tx['exonStarts'])): ex_start = tx['exonStarts'][i] ex_end = tx['exonEnds'][i] # Skip exons outside CDS if ex_end <= tx['cdsStart'] or ex_start >= tx['cdsEnd']: continue # Clip exon to CDS boundaries region_start = max(ex_start, tx['cdsStart']) region_end = min(ex_end, tx['cdsEnd']) cds_regions.append((region_start, region_end, i+1)) # For minus strand, reverse order (CDS starts from high genomic coords) if tx['strand'] == '-': cds_regions = cds_regions[::-1] return cds_regions def aa_to_genomic_plus(aa_start, aa_end, cds_regions): """Convert AA range to genomic segments for + strand genes""" nt_start = (aa_start - 1) * 3 + 1 # 1-based nucleotide position nt_end = aa_end * 3 segments = [] cumulative = 0 for start, end, exon_num in cds_regions: region_len = end - start region_nt_start = cumulative + 1 region_nt_end = cumulative + region_len if region_nt_end >= nt_start and region_nt_start <= nt_end: overlap_nt_start = max(nt_start, region_nt_start) overlap_nt_end = min(nt_end, region_nt_end) genomic_start = start + (overlap_nt_start - region_nt_start) genomic_end = start + (overlap_nt_end - region_nt_start) + 1 segments.append((genomic_start, genomic_end, exon_num)) cumulative += region_len return segments def aa_to_genomic_minus(aa_start, aa_end, cds_regions): """Convert AA range to genomic segments for - strand genes""" nt_start = (aa_start - 1) * 3 + 1 nt_end = aa_end * 3 segments = [] cumulative = 0 for start, end, exon_num in cds_regions: region_len = end - start region_nt_start = cumulative + 1 region_nt_end = cumulative + region_len if region_nt_end >= nt_start and region_nt_start <= nt_end: overlap_nt_start = max(nt_start, region_nt_start) overlap_nt_end = min(nt_end, region_nt_end) offset_from_end_start = overlap_nt_start - region_nt_start offset_from_end_end = overlap_nt_end - region_nt_start genomic_end = end - offset_from_end_start genomic_start = end - offset_from_end_end - 1 if genomic_start > genomic_end: genomic_start, genomic_end = genomic_end, genomic_start segments.append((genomic_start, genomic_end, exon_num)) cumulative += region_len return segments def generate_bed_entries(db, transcripts_info): """Generate BED entries for all domains""" bed_lines = [] for gene, domain, aa_start, aa_end in DOMAINS: tx = transcripts_info[gene] chrom = tx['chrom'] strand = tx['strand'] accession = tx['name'] cds_regions = build_cds_regions(tx) if strand == '+': segments = aa_to_genomic_plus(aa_start, aa_end, cds_regions) else: segments = aa_to_genomic_minus(aa_start, aa_end, cds_regions) for seg_start, seg_end, exon_num in segments: if seg_start >= seg_end: print(f" WARNING: Skipping invalid segment for {gene} {domain}: {seg_start}-{seg_end}") continue aa_loc = f"{aa_start}-{aa_end}" mouse_over = f"Domain: {domain}
Gene: {gene}
Transcript: {accession}
Amino acid loc: {aa_loc}" bed_line = f"{chrom}\t{seg_start}\t{seg_end}\t{domain}\t0\t.\t{seg_start}\t{seg_end}\t230,3,131\t{gene}\t{accession}\t{aa_loc}\t{mouse_over}" bed_lines.append(bed_line) return bed_lines def create_track(db, output_dir): """Create BED and bigBed files for a given genome assembly""" print(f"\n{'='*70}") print(f"Processing {db}") print(f"{'='*70}") # Query transcript info from hgsql print(f"\nQuerying transcript coordinates from {db}.ncbiRefSeq...") transcripts_info = {} for gene, accession in TRANSCRIPTS.items(): print(f" {gene}: {accession}") transcripts_info[gene] = get_transcript_info(db, accession) # Generate BED entries print("\nGenerating BED entries...") bed_lines = generate_bed_entries(db, transcripts_info) print(f" Generated {len(bed_lines)} domain segments") # Write BED file bed_file = os.path.join(output_dir, f"InSiGHTclinDomains_{db}.bed") print(f"\nWriting BED file: {bed_file}") with open(bed_file, 'w') as f: f.write('\n'.join(bed_lines) + '\n') # Sort BED file print("Sorting BED file...") bash(f"sort -k1,1 -k2,2n {bed_file} -o {bed_file}") # Create bigBed as_file = os.path.join(output_dir, "InSiGHTclinDomains.as") bb_file = os.path.join(output_dir, f"InSiGHTclinDomains{db.capitalize()}.bb") chrom_sizes = f"/cluster/data/{db}/chrom.sizes" print(f"\nCreating bigBed file: {bb_file}") try: bash(f"bedToBigBed -as={as_file} -type=bed9+4 -tab {bed_file} {chrom_sizes} {bb_file}") print(f" Successfully created: {bb_file}") except Exception as e: print(f" ERROR creating bigBed: {e}") return bed_file, bb_file # ============================================================================ # Main execution # ============================================================================ if __name__ == "__main__": print("=" * 70) print("InSiGHT VCEP Clinically Relevant Protein Domains Track Generator") print("=" * 70) # Create output directory if needed os.makedirs(OUTPUT_DIR, exist_ok=True) # Write AutoSql file as_file = os.path.join(OUTPUT_DIR, "InSiGHTclinDomains.as") print(f"\nWriting AutoSql file: {as_file}") with open(as_file, 'w') as f: f.write(AUTOSQL) # Process both assemblies output_files = {} for db in ['hg38', 'hg19']: try: bed_file, bb_file = create_track(db, OUTPUT_DIR) output_files[db] = {'bed': bed_file, 'bb': bb_file} except Exception as e: print(f"\nERROR processing {db}: {e}") # Print summary print("\n" + "=" * 70) print("Output Files") print("=" * 70) print(f" AutoSql file: {as_file}") for db, files in output_files.items(): print(f"\n {db}:") print(f" BED file: {files['bed']}") if os.path.exists(files['bb']): print(f" BigBed file: {files['bb']}") print("\n" + "=" * 70) print("Custom Track Line (for bigBed)") print("=" * 70) print(""" track type=bigBed name="InSiGHT Domains" \\ description="InSiGHT VCEP Clinically Relevant Protein Domains" \\ visibility=pack itemRgb=on \\ bigDataUrl= """) print("=" * 70) print("Done!") print("=" * 70)