f24ea956ba3b7a8c7ee8be0826d66489b85e0bbc lrnassar Tue Apr 28 18:22:48 2026 -0700 Adding TP53 VCEP track hub build scripts. refs #37399 16 Python scripts that build the 15 bigBed tracks for the ClinGen TP53 VCEP track hub (CSpec GN009 v2.4.0, NM_000546.6 / NP_000537.3). Includes the NON-FINAL Provisional Classification (Tavtigian point sum across PM1 + PS3/BS3 + PP3/BP4 + AF + BS2 + splicing PP3), gnomAD v4.1 AF codes, FLOSSIES BS2 evidence, Bioinformatic PP3/BP4 (missense + single-aa in-frame del), VCEP curated variants from EvRepo with ClinVar VCV backfill, PM1 (clinical domains + cancerhotspots), PVS1 regions and splice sites, and the Functional Evidence composite (VCEP preliminary PS3/BS3 with per-paper raw scores from Kato/Giacomelli/Kawaguchi/Funk). diff --git src/hg/makeDb/scripts/tp53/tp53FuncLib.py src/hg/makeDb/scripts/tp53/tp53FuncLib.py new file mode 100644 index 00000000000..e3b5da12edb --- /dev/null +++ src/hg/makeDb/scripts/tp53/tp53FuncLib.py @@ -0,0 +1,211 @@ +""" +Shared helpers for the TP53 VCEP hub build scripts. + +All TP53 tracks use the same canonical transcript (NM_000546.6, chr17 minus +strand, 393 aa) and share the amino-acid-to-genomic mapping logic. Centralizing +it here keeps the minus-strand handling consistent across tracks, avoiding the +off-by-one class of bugs that bit InSiGHT's PMS2 before being fixed. + +Canonical transcript: NM_000546.6 / NP_000537.3 (MANE Select) +""" + +import subprocess + +TRANSCRIPT = "NM_000546.6" +PROTEIN = "NP_000537.3" +GENE = "TP53" + + +def bash(cmd): + """Run cmd in bash subprocess, return stdout; raise on non-zero exit.""" + try: + out = subprocess.run( + cmd, check=True, shell=True, + stdout=subprocess.PIPE, universal_newlines=True, + stderr=subprocess.STDOUT, + ) + return out.stdout + except subprocess.CalledProcessError as e: + raise RuntimeError( + "command '{}' returned error (code {}): {}".format( + e.cmd, e.returncode, e.output) + ) + + +def get_transcript_info(db, accession=TRANSCRIPT): + """Query hgsql ncbiRefSeq and return a dict with tx/cds/exon info.""" + query = ( + "SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, " + "exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{}'" + ).format(accession) + result = bash('hgsql {} -Ne "{}"'.format(db, query)) + if not result.strip(): + raise ValueError("Transcript {} not found in {}.ncbiRefSeq".format(accession, db)) + fields = result.strip().split('\t') + exon_starts = [int(x) for x in fields[7].rstrip(',').split(',')] + exon_ends = [int(x) for x in fields[8].rstrip(',').split(',')] + return { + 'name': fields[0], + 'chrom': fields[1], + 'strand': fields[2], + 'txStart': int(fields[3]), + 'txEnd': int(fields[4]), + 'cdsStart': int(fields[5]), + 'cdsEnd': int(fields[6]), + 'exonStarts': exon_starts, + 'exonEnds': exon_ends, + } + + +def build_cds_regions(tx): + """Return list of (start, end, exon_num) CDS segments, reversed for minus strand.""" + cds = [] + for i in range(len(tx['exonStarts'])): + ex_start = tx['exonStarts'][i] + ex_end = tx['exonEnds'][i] + if ex_end <= tx['cdsStart'] or ex_start >= tx['cdsEnd']: + continue + cds.append((max(ex_start, tx['cdsStart']), + min(ex_end, tx['cdsEnd']), + i + 1)) + if tx['strand'] == '-': + cds = cds[::-1] + return cds + + +def aa_to_genomic(aa_start, aa_end, tx): + """Map an AA range (1-based inclusive) to genomic segments in BED half-open. + + Uses the +/- strand-aware implementation from InSiGHT's clinDomains + (post-PMS2-off-by-one-fix). Do not re-derive this math. + """ + cds = build_cds_regions(tx) + nt_start = (aa_start - 1) * 3 + 1 + nt_end = aa_end * 3 + segments = [] + cumulative = 0 + for start, end, exon_num in cds: + region_len = end - start + r_nt_start = cumulative + 1 + r_nt_end = cumulative + region_len + if r_nt_end >= nt_start and r_nt_start <= nt_end: + ov_nt_start = max(nt_start, r_nt_start) + ov_nt_end = min(nt_end, r_nt_end) + off_start = ov_nt_start - r_nt_start + off_end = ov_nt_end - r_nt_start + if tx['strand'] == '+': + g_start = start + off_start + g_end = start + off_end + 1 + else: + g_end = end - off_start + g_start = end - off_end - 1 + if g_start > g_end: + g_start, g_end = g_end, g_start + segments.append((g_start, g_end, exon_num)) + cumulative += region_len + return segments + + +def aa_codon_genomic(aa_pos, tx): + """Convenience: genomic segments covering a single codon.""" + return aa_to_genomic(aa_pos, aa_pos, tx) + + +def cdna_coding_to_genomic(c_pos, tx): + """Map a CDS-relative cDNA position (1-based, c.1..c.CDSlen) to a genomic + coord (0-based inclusive). Returns None if position is outside the CDS. + """ + aa_pos = (c_pos - 1) // 3 + 1 + within = (c_pos - 1) % 3 + segs = aa_to_genomic(aa_pos, aa_pos, tx) + if not segs: + return None + # Collapse the (up to 2) segments for a codon into 3 nucleotide positions + # in mRNA order. + if tx['strand'] == '+': + nts = [] + for g_start, g_end, _ex in segs: + for g in range(g_start, g_end): + nts.append(g) + else: + # Minus strand: mRNA order is reverse of genomic order + nts = [] + for g_start, g_end, _ex in segs: + # This segment's mRNA order = highest-to-lowest genomic + for g in range(g_end - 1, g_start - 1, -1): + nts.append(g) + # Segments on minus strand came in "reversed exon order" — confirm by + # checking that nts is monotonically decreasing. If a codon spans an + # intron the two segments should already be ordered correctly. + if within >= len(nts): + return None + return nts[within] + + +def write_autosql(path, content): + with open(path, 'w') as f: + f.write(content) + if not content.endswith('\n'): + f.write('\n') + + +def run_bedToBigBed(bed, as_file, bb, chrom_sizes, bed_type): + """Invoke bedToBigBed with the given type (e.g., 'bed9+5').""" + bash("bedToBigBed -as={} -type={} -tab {} {} {}".format( + as_file, bed_type, bed, chrom_sizes, bb)) + + +def chrom_sizes_path(db): + return "/cluster/data/{}/chrom.sizes".format(db) + + +PER_PAPER_BEDS = { + 'kato': ("/hive/users/lrnassar/claude/RM37399/functionalAssays/kato/TP53FuncKato_hg38.bed", 10), + 'giacomelli': ("/hive/users/lrnassar/claude/RM37399/functionalAssays/giacomelli/TP53FuncGiacomelli_hg38.bed", 10), + 'kawaguchi': ("/hive/users/lrnassar/claude/RM37399/functionalAssays/kawaguchi/TP53FuncKawaguchi_hg38.bed", 9), + 'funk': ("/hive/users/lrnassar/claude/RM37399/functionalAssays/funk/TP53FuncFunk_hg38.bed", 10), +} + + +def load_per_paper_raw_scores(): + """Read the per-paper functional assay BED files and return + {short_protein_change: {kato_pct, giac_z, kaw_state, funk_rfs}}. + + Score column index (0-based): + kato -> col 10 (median % WT transactivation) + giacomelli -> col 10 (A549 etoposide Z-score) + kawaguchi -> col 9 (oligomer state: Tetramer / Dimer / Monomer) + funk -> col 10 (CRISPR RFS median) + """ + import os + out = {} + for paper, (path, score_col) in PER_PAPER_BEDS.items(): + if not os.path.exists(path): + continue + with open(path) as f: + for line in f: + flds = line.rstrip("\n").split("\t") + if len(flds) <= score_col: + continue + name = flds[3] + val = flds[score_col] + out.setdefault(name, {})[paper] = val + return out + + +def html_ascii_safe(s): + """Encode any non-ASCII character as an HTML numeric entity (&#NNNN;). + + UCSC hgTracks renders mouseover text through a pipeline that does not + always preserve raw UTF-8 bytes; non-ASCII characters show up as + mojibake (e.g. em dash as 'â€"'). Converting to &#NNNN; is reliable. + Keeps & < > " ' as literals since they are valid in the existing + mouseover HTML (the templates already escape user-controlled data + separately where needed). + """ + if s is None: + return s + return "".join( + ch if ord(ch) < 128 else "&#{};".format(ord(ch)) + for ch in str(s) + )