a56d88e05670b759ff3b32829542537ccc790c57 lrnassar Tue Apr 28 19:18:20 2026 -0700 Address CR feedback on insight + tp53 hub scripts. refs #37418 Drop duplicated bash() wrappers in favor of subprocess.run / check_output with list args, eliminating shell=True, embedded-quote concerns, and stderr-into-stdout merging. Centralize common operations as run_sort_bed/run_liftOver in tp53FuncLib alongside existing run_bedToBigBed. Switch HTML escaping to stdlib html.escape() consistently. insightHCIPriors mouseover (previously unescaped) now escapes HGVS fields, addressing the specific c.123A>G case Jonathan flagged. Replace invalid
tags with
across all five affected mouseover sites. diff --git src/hg/makeDb/scripts/tp53/tp53FuncLib.py src/hg/makeDb/scripts/tp53/tp53FuncLib.py index e3b5da12edb..3f50583642f 100644 --- src/hg/makeDb/scripts/tp53/tp53FuncLib.py +++ src/hg/makeDb/scripts/tp53/tp53FuncLib.py @@ -1,211 +1,210 @@ """ Shared helpers for the TP53 VCEP hub build scripts. All TP53 tracks use the same canonical transcript (NM_000546.6, chr17 minus strand, 393 aa) and share the amino-acid-to-genomic mapping logic. Centralizing it here keeps the minus-strand handling consistent across tracks, avoiding the off-by-one class of bugs that bit InSiGHT's PMS2 before being fixed. Canonical transcript: NM_000546.6 / NP_000537.3 (MANE Select) """ import subprocess TRANSCRIPT = "NM_000546.6" PROTEIN = "NP_000537.3" GENE = "TP53" -def bash(cmd): - """Run cmd in bash subprocess, return stdout; raise on non-zero exit.""" - try: - out = subprocess.run( - cmd, check=True, shell=True, - stdout=subprocess.PIPE, universal_newlines=True, - stderr=subprocess.STDOUT, - ) - return out.stdout - except subprocess.CalledProcessError as e: - raise RuntimeError( - "command '{}' returned error (code {}): {}".format( - e.cmd, e.returncode, e.output) - ) +def run_sort_bed(bed): + """Sort a BED file in place by chrom, then start.""" + subprocess.run(["sort", "-k1,1", "-k2,2n", bed, "-o", bed], check=True) + + +def run_liftOver(in_bed, chain, out_bed, unmapped): + """Wrap liftOver.""" + subprocess.run( + ["liftOver", in_bed, chain, out_bed, unmapped], + check=True) def get_transcript_info(db, accession=TRANSCRIPT): """Query hgsql ncbiRefSeq and return a dict with tx/cds/exon info.""" query = ( "SELECT name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, " "exonStarts, exonEnds FROM ncbiRefSeq WHERE name='{}'" ).format(accession) - result = bash('hgsql {} -Ne "{}"'.format(db, query)) + result = subprocess.check_output( + ["hgsql", db, "-Ne", query], text=True) if not result.strip(): raise ValueError("Transcript {} not found in {}.ncbiRefSeq".format(accession, db)) fields = result.strip().split('\t') exon_starts = [int(x) for x in fields[7].rstrip(',').split(',')] exon_ends = [int(x) for x in fields[8].rstrip(',').split(',')] return { 'name': fields[0], 'chrom': fields[1], 'strand': fields[2], 'txStart': int(fields[3]), 'txEnd': int(fields[4]), 'cdsStart': int(fields[5]), 'cdsEnd': int(fields[6]), 'exonStarts': exon_starts, 'exonEnds': exon_ends, } def build_cds_regions(tx): """Return list of (start, end, exon_num) CDS segments, reversed for minus strand.""" cds = [] for i in range(len(tx['exonStarts'])): ex_start = tx['exonStarts'][i] ex_end = tx['exonEnds'][i] if ex_end <= tx['cdsStart'] or ex_start >= tx['cdsEnd']: continue cds.append((max(ex_start, tx['cdsStart']), min(ex_end, tx['cdsEnd']), i + 1)) if tx['strand'] == '-': cds = cds[::-1] return cds def aa_to_genomic(aa_start, aa_end, tx): """Map an AA range (1-based inclusive) to genomic segments in BED half-open. Uses the +/- strand-aware implementation from InSiGHT's clinDomains (post-PMS2-off-by-one-fix). Do not re-derive this math. """ cds = build_cds_regions(tx) nt_start = (aa_start - 1) * 3 + 1 nt_end = aa_end * 3 segments = [] cumulative = 0 for start, end, exon_num in cds: region_len = end - start r_nt_start = cumulative + 1 r_nt_end = cumulative + region_len if r_nt_end >= nt_start and r_nt_start <= nt_end: ov_nt_start = max(nt_start, r_nt_start) ov_nt_end = min(nt_end, r_nt_end) off_start = ov_nt_start - r_nt_start off_end = ov_nt_end - r_nt_start if tx['strand'] == '+': g_start = start + off_start g_end = start + off_end + 1 else: g_end = end - off_start g_start = end - off_end - 1 if g_start > g_end: g_start, g_end = g_end, g_start segments.append((g_start, g_end, exon_num)) cumulative += region_len return segments def aa_codon_genomic(aa_pos, tx): """Convenience: genomic segments covering a single codon.""" return aa_to_genomic(aa_pos, aa_pos, tx) def cdna_coding_to_genomic(c_pos, tx): """Map a CDS-relative cDNA position (1-based, c.1..c.CDSlen) to a genomic coord (0-based inclusive). Returns None if position is outside the CDS. """ aa_pos = (c_pos - 1) // 3 + 1 within = (c_pos - 1) % 3 segs = aa_to_genomic(aa_pos, aa_pos, tx) if not segs: return None # Collapse the (up to 2) segments for a codon into 3 nucleotide positions # in mRNA order. if tx['strand'] == '+': nts = [] for g_start, g_end, _ex in segs: for g in range(g_start, g_end): nts.append(g) else: # Minus strand: mRNA order is reverse of genomic order nts = [] for g_start, g_end, _ex in segs: # This segment's mRNA order = highest-to-lowest genomic for g in range(g_end - 1, g_start - 1, -1): nts.append(g) # Segments on minus strand came in "reversed exon order" — confirm by # checking that nts is monotonically decreasing. If a codon spans an # intron the two segments should already be ordered correctly. if within >= len(nts): return None return nts[within] def write_autosql(path, content): with open(path, 'w') as f: f.write(content) if not content.endswith('\n'): f.write('\n') def run_bedToBigBed(bed, as_file, bb, chrom_sizes, bed_type): """Invoke bedToBigBed with the given type (e.g., 'bed9+5').""" - bash("bedToBigBed -as={} -type={} -tab {} {} {}".format( - as_file, bed_type, bed, chrom_sizes, bb)) + subprocess.run( + ["bedToBigBed", "-as=" + as_file, "-type=" + bed_type, "-tab", + bed, chrom_sizes, bb], + check=True) def chrom_sizes_path(db): return "/cluster/data/{}/chrom.sizes".format(db) PER_PAPER_BEDS = { 'kato': ("/hive/users/lrnassar/claude/RM37399/functionalAssays/kato/TP53FuncKato_hg38.bed", 10), 'giacomelli': ("/hive/users/lrnassar/claude/RM37399/functionalAssays/giacomelli/TP53FuncGiacomelli_hg38.bed", 10), 'kawaguchi': ("/hive/users/lrnassar/claude/RM37399/functionalAssays/kawaguchi/TP53FuncKawaguchi_hg38.bed", 9), 'funk': ("/hive/users/lrnassar/claude/RM37399/functionalAssays/funk/TP53FuncFunk_hg38.bed", 10), } def load_per_paper_raw_scores(): """Read the per-paper functional assay BED files and return {short_protein_change: {kato_pct, giac_z, kaw_state, funk_rfs}}. Score column index (0-based): kato -> col 10 (median % WT transactivation) giacomelli -> col 10 (A549 etoposide Z-score) kawaguchi -> col 9 (oligomer state: Tetramer / Dimer / Monomer) funk -> col 10 (CRISPR RFS median) """ import os out = {} for paper, (path, score_col) in PER_PAPER_BEDS.items(): if not os.path.exists(path): continue with open(path) as f: for line in f: flds = line.rstrip("\n").split("\t") if len(flds) <= score_col: continue name = flds[3] val = flds[score_col] out.setdefault(name, {})[paper] = val return out def html_ascii_safe(s): """Encode any non-ASCII character as an HTML numeric entity (&#NNNN;). UCSC hgTracks renders mouseover text through a pipeline that does not always preserve raw UTF-8 bytes; non-ASCII characters show up as mojibake (e.g. em dash as 'â€"'). Converting to &#NNNN; is reliable. Keeps & < > " ' as literals since they are valid in the existing mouseover HTML (the templates already escape user-controlled data separately where needed). """ if s is None: return s return "".join( ch if ord(ch) < 128 else "&#{};".format(ord(ch)) for ch in str(s) )