46f3b4d35904ddc3df91e916ad15c05ad878b3d1
lrnassar
Thu May 21 08:02:58 2026 -0700
Adding PMS2 Pseudogene Caution track to InSiGHT hub. refs #36582
diff --git src/hg/makeDb/scripts/insight/insightPMS2Caution.py src/hg/makeDb/scripts/insight/insightPMS2Caution.py
new file mode 100644
index 00000000000..89760e5fb6e
--- /dev/null
+++ src/hg/makeDb/scripts/insight/insightPMS2Caution.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+"""
+InSiGHT VCEP PMS2 Pseudogene Caution Regions Track Generator
+
+This script generates UCSC Genome Browser tracks (bigBed format) flagging
+PMS2 exons that share high sequence homology with the PMS2CL pseudogene
+(chr7), and the PMS2CL pseudogene region itself. Variants called in these
+regions by short-read NGS may be pseudogene-derived and benefit from
+additional orthogonal confirmation.
+
+Per-exon homology data is drawn from the literature (Hayward 2007,
+van der Klift 2010, Vaughn 2011, Clendenning 2006).
+
+Exon coordinates are queried from hgsql ncbiRefSeq for both hg38 and hg19.
+PMS2CL coordinates are also queried from ncbiRefSeq.
+
+Author: Generated for InSiGHT VCEP
+Date: 2026
+"""
+
+import subprocess
+import os
+
+def bash(cmd):
+ """Run the cmd in bash subprocess"""
+ try:
+ rawBashOutput = subprocess.run(cmd, check=True, shell=True,
+ stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+ bashStdout = rawBashOutput.stdout
+ except subprocess.CalledProcessError as e:
+ raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+ return(bashStdout)
+
+# ============================================================================
+# Configuration
+# ============================================================================
+OUTPUT_DIR = "/hive/users/lrnassar/insightHub/pms2Caution"
+
+PMS2_ACC = "NM_000535.7"
+PMS2CL_ACC = "NR_002217.1"
+
+# Per-exon caution data for PMS2 (15 exons, minus strand, so exon 1 has the
+# highest genomic coordinate). Codon ranges are approximate (boundaries fall
+# between codons). Homology percentages are from the literature.
+# Format: (exon_num, codon_start, codon_end, homology_text, caution_level, color)
+PMS2_EXONS = [
+ (1, None, None, None, 'Safe', '0,170,0'),
+ (2, None, None, None, 'Safe', '0,170,0'),
+ (3, None, None, None, 'Safe', '0,170,0'),
+ (4, None, None, None, 'Safe', '0,170,0'),
+ (5, None, None, None, 'Safe', '0,170,0'),
+ (6, None, None, None, 'Safe', '0,170,0'),
+ (7, None, None, None, 'Safe', '0,170,0'),
+ (8, None, None, None, 'Safe', '0,170,0'),
+ (9, 302, 329, '~98%', 'Moderate', '230,150,0'),
+ (10, 330, 381, None, 'Safe', '0,170,0'),
+ (11, 382, 668, '~99%', 'High', '210,0,0'),
+ (12, 669, 724, '>99%', 'High', '210,0,0'),
+ (13, 725, 758, '>99%', 'High', '210,0,0'),
+ (14, 759, 815, '>99%', 'High', '210,0,0'),
+ (15, 816, 862, '>99%', 'High', '210,0,0'),
+]
+
+# Recommendation text by caution level
+RECOMMENDATIONS = {
+ 'High': 'Variants called by short-read NGS in this region may be pseudogene-derived and may require additional orthogonal validation (e.g., long-range PCR).',
+ 'Moderate': 'This region shares moderate homology with PMS2CL; short-read NGS calls may benefit from additional orthogonal validation.',
+ 'Safe': 'No homology to PMS2CL; standard NGS calls are reliable in this region.',
+}
+
+# Pseudogene description
+PMS2CL_DESC = ('PMS2CL pseudogene; not a coding copy of PMS2. Variants assigned here '
+ 'are not pathogenic for Lynch syndrome.')
+
+# AutoSQL definition for BED9+5 format
+AUTOSQL = """table InSiGHTPMS2Caution
+"PMS2 pseudogene caution regions: PMS2 exons with PMS2CL homology, plus the PMS2CL pseudogene"
+ (
+ string chrom; "Reference sequence chromosome or scaffold"
+ uint chromStart; "Start position in chromosome"
+ uint chromEnd; "End position in chromosome"
+ string name; "Region name"
+ uint score; "Not used, all 0"
+ char[1] strand; "Not used, all ."
+ uint thickStart; "Same as chromStart"
+ uint thickEnd; "Same as chromEnd"
+ uint reserved; "RGB value (use R,G,B string in input file)"
+ string region; "PMS2 exon number or PMS2CL pseudogene"
+ string homology; "Sequence identity to PMS2CL"
+ string codonRange; "Approximate amino acid range in PMS2"
+ string cautionLevel; "High / Moderate / Safe / Pseudogene"
+ lstring _mouseOver; "Field only used as mouseOver"
+ )
+"""
+
+# ============================================================================
+# Functions
+# ============================================================================
+
+def get_pms2_exons(db):
+ """Query hgsql for PMS2 NM_000535.7 exon coordinates.
+ Returns list of (exon_num, genomic_start, genomic_end) in transcript order
+ (i.e., reversed for minus strand)."""
+ result = bash(f'hgsql {db} -Ne \'select chrom, txStart, exonStarts, exonEnds, strand from ncbiRefSeq where name="{PMS2_ACC}"\'')
+ fields = result.strip().split('\t')
+ chrom = fields[0]
+ starts = [int(x) for x in fields[2].rstrip(',').split(',')]
+ ends = [int(x) for x in fields[3].rstrip(',').split(',')]
+ strand = fields[4]
+ exons_genomic = list(zip(starts, ends))
+ # For minus strand, reverse so exon 1 is first in the list
+ if strand == '-':
+ exons_genomic = exons_genomic[::-1]
+ # Return (exon_num, start, end) where exon_num is 1-based transcript order
+ return chrom, [(i+1, s, e) for i, (s, e) in enumerate(exons_genomic)]
+
+
+def get_pms2cl_region(db):
+ """Query hgsql for PMS2CL coordinates."""
+ result = bash(f'hgsql {db} -Ne \'select chrom, txStart, txEnd from ncbiRefSeq where name="{PMS2CL_ACC}"\'')
+ fields = result.strip().split('\t')
+ return fields[0], int(fields[1]), int(fields[2])
+
+
+def html_escape_rule(text):
+ """HTML-encode special characters for mouseover."""
+ return text.replace('≥', '≥').replace('≤', '≤').replace('>', '>').replace('<', '<')
+
+
+def generate_bed_entries(db):
+ """Generate BED entries for PMS2 exons and PMS2CL pseudogene."""
+ bed_lines = []
+
+ # PMS2 exons
+ pms2_chrom, pms2_exons = get_pms2_exons(db)
+ for (exon_num, _, _, _, _, _), (_, ex_start, ex_end) in zip(PMS2_EXONS, pms2_exons):
+ meta = PMS2_EXONS[exon_num - 1]
+ _, codon_start, codon_end, homology, caution, color = meta
+
+ name = f"PMS2 exon {exon_num}"
+ if codon_start is not None:
+ codon_range = f"codons {codon_start}-{codon_end}"
+ else:
+ codon_range = ""
+ homology_str = homology if homology else "none"
+ rec = RECOMMENDATIONS[caution]
+
+ mouse_over = (f"Region: PMS2 exon {exon_num}"
+ f"Homology to PMS2CL: {html_escape_rule(homology_str)}"
+ f"Caution: {caution}"
+ f"Note: {html_escape_rule(rec)}")
+
+ bed_line = (f"{pms2_chrom}\t{ex_start}\t{ex_end}\tPMS2 exon {exon_num}\t0\t.\t"
+ f"{ex_start}\t{ex_end}\t{color}\t"
+ f"exon {exon_num}\t{homology_str}\t{codon_range}\t{caution}\t{mouse_over}")
+ bed_lines.append(bed_line)
+
+ # PMS2CL pseudogene
+ psg_chrom, psg_start, psg_end = get_pms2cl_region(db)
+ psg_color = '128,128,128'
+ mouse_over = (f"Region: PMS2CL pseudogene"
+ f"Note: {html_escape_rule(PMS2CL_DESC)}")
+ bed_line = (f"{psg_chrom}\t{psg_start}\t{psg_end}\tPMS2CL pseudogene\t0\t.\t"
+ f"{psg_start}\t{psg_end}\t{psg_color}\t"
+ f"PMS2CL pseudogene\tn/a\t\tPseudogene\t{mouse_over}")
+ bed_lines.append(bed_line)
+
+ return bed_lines
+
+
+def create_track(db):
+ """Create BED and bigBed files for a given genome assembly."""
+ print(f"\n{'='*70}")
+ print(f"Processing {db}")
+ print(f"{'='*70}")
+
+ bed_lines = generate_bed_entries(db)
+ print(f" Generated {len(bed_lines)} entries")
+
+ bed_file = os.path.join(OUTPUT_DIR, f"InSiGHTPMS2Caution_{db}.bed")
+ print(f" Writing BED file: {bed_file}")
+ with open(bed_file, 'w') as f:
+ for line in bed_lines:
+ f.write(line + '\n')
+
+ sorted_bed = bed_file.replace('.bed', '.sorted.bed')
+ bash(f"sort -k1,1 -k2,2n {bed_file} > {sorted_bed}")
+
+ as_file = os.path.join(OUTPUT_DIR, "InSiGHTPMS2Caution.as")
+ bb_file = os.path.join(OUTPUT_DIR, f"InSiGHTPMS2Caution{db.capitalize()}.bb")
+ chrom_sizes = f"/cluster/data/{db}/chrom.sizes"
+
+ bash(f"bedToBigBed -type=bed9+5 -as={as_file} -tab {sorted_bed} {chrom_sizes} {bb_file}")
+ print(f" Successfully created: {bb_file}")
+
+ os.remove(sorted_bed)
+ return bb_file
+
+
+def main():
+ print("=" * 70)
+ print("InSiGHT VCEP PMS2 Pseudogene Caution Regions Track Generator")
+ print("=" * 70)
+
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+ as_file = os.path.join(OUTPUT_DIR, "InSiGHTPMS2Caution.as")
+ print(f"\nWriting AutoSql file: {as_file}")
+ with open(as_file, 'w') as f:
+ f.write(AUTOSQL)
+
+ for db in ['hg38', 'hg19']:
+ create_track(db)
+
+ print("\n" + "=" * 70)
+ print("Done!")
+ print("=" * 70)
+
+
+if __name__ == '__main__':
+ main()