#!/usr/bin/env python3
"""
Convert mpravardb.csv to BED9+ format, liftOver hg19 to hg38, merge, and create bigBed.

Usage:
    cd /hive/data/genomes/hg38/bed/mpra/mpravardb
    python3 ~/kent/src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py
"""
# Change history
# --------------
# commit 888e7470c14eeecdca310ed36bb45c3c00ae8052
# Author: lrnassar    Date: Tue Apr 21 15:14:04 2026 -0700
# File: src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py
#       (index a7b13d1317b..c97884640b5)
#
# QA fixes for MPRA superTrack. refs #37359
#
# Fix broken mpraVarDb bigDataUrl -- pointed at /gbdb/hg38/mpra/mpravardb.bb
# but the file is at /gbdb/hg38/mpra/mpravardb/mpravardb.bb, causing
# hgTrackDb -strict to silently drop the subtrack.
#
# Rebuild mpravardb.bb after two fixes in mpravardbToBed.py: sanitize UTF-8 in
# user-visible string fields (curly quotes, primes, NBSP mojibake) that the
# browser does not transcode, eliminating ~246k non-ASCII occurrences across
# 42% of rows; and change safe_float / pval_to_score to write NaN and return
# score 0 for NA / out-of-range p-values instead of 0.0 and score 1000
# (previously inflated untested variants to the top of score-sorted views).
#
# trackDb stanza cleanup: shorten mpraVarDb longLabel, drop superfluous
# type bed 4 from superTrack, make bigBed 9+13 explicit, remove redundant
# mouseOverField, align parent mpra on, add filterValues for
# cell_line/assay/cellLine and filterByRange sliders for
# percentile_rank / fdr / log2FC, add labelFields and maxWindowToDraw.
#
# Description pages: add cross-species disclosure (mouse reporter cells used
# to assay human sequences), update mpraVarDb header to post-liftOver count
# 239,028 with Studies-table footnote, fix mpraVarDb.html download-server
# paths, soften imprecise "51 MPRA experiments" claim in mpra.html and
# mprabase.html.
#
# relatedTracks.ra: reciprocal mpra <-> wgEncodeReg4 and mpra <-> cCREs.
# Expand mpra.txt makedoc with upstream provenance and QA-rebuild log.

import csv
import re
import subprocess
import sys
import os
import math

# expanduser("~") yields $HOME when it is set, but does not raise KeyError at
# import time when the variable is missing from the environment.
SCRIPT_DIR = os.path.join(os.path.expanduser("~"),
                          "kent/src/hg/makeDb/scripts/mpravardb")
AS_FILE = os.path.join(SCRIPT_DIR, "mpravardb.as")
LIFTOVER_CHAIN = "/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz"
CHROM_SIZES = "/hive/data/genomes/hg38/chrom.sizes"
INPUT_CSV = "mpravardb.csv"

# Upstream CSV contains UTF-8 curly quotes, primes, and NBSP mojibake.
# Browser does not transcode UTF-8 in bigBed fields, so everything user-visible
# must be plain ASCII. Transliterate or strip.
# Keys are written as \u escapes so this source file itself stays ASCII and the
# NBSP key cannot be confused with an ordinary space.
_SANITIZE_MAP = {
    "\u00ac\u2020": " ",  # NOT SIGN + DAGGER: mis-decoded NBSP, i.e. a space
    "\u2019": "'",        # RIGHT SINGLE QUOTATION MARK (used as apostrophe)
    "\u2018": "'",        # LEFT SINGLE QUOTATION MARK
    "\u201c": '"',        # LEFT DOUBLE QUOTATION MARK
    "\u201d": '"',        # RIGHT DOUBLE QUOTATION MARK
    "\u2032": "'",        # PRIME (used after numerals: 3'UTR)
    "\u2033": '"',        # DOUBLE PRIME
    "\u2013": "-",        # EN DASH
    "\u2014": "-",        # EM DASH
    "\u2026": "...",      # HORIZONTAL ELLIPSIS
    "\u00a0": " ",        # NO-BREAK SPACE
    "\u00ac": "",         # NOT SIGN (stray half of an NBSP mojibake pair)
    "\u2020": "",         # DAGGER (stray half of an NBSP mojibake pair)
}
# Regex alternation takes the first alternative that matches, not the longest,
# so list longer keys first: the two-character mojibake pair must be tried
# before its single-character halves.
_SANITIZE_RE = re.compile(
    "|".join(re.escape(k)
             for k in sorted(_SANITIZE_MAP, key=len, reverse=True))
)
_WHITESPACE_RE = re.compile(r"\s+")


def sanitize_text(s):
    """Return ASCII-only version of s for bigBed string fields.

    Characters listed in _SANITIZE_MAP are transliterated, any other non-ASCII
    character is dropped, and runs of whitespace are collapsed to one space
    with leading/trailing whitespace removed. None yields "".
    """
    if s is None:
        return ""
    out = _SANITIZE_RE.sub(lambda m: _SANITIZE_MAP[m.group()], s)
    # Drop any remaining non-ASCII (rare), then collapse runs of whitespace
    out = out.encode("ascii", "ignore").decode("ascii")
    return _WHITESPACE_RE.sub(" ", out).strip()


def pval_to_score(pval):
    """Convert p-value to a 0-1000 score using -log10.

    Missing / out-of-range / non-numeric -> 0 (not 1000).
    Many rows upstream encode NA as literal 0.0, which is indistinguishable
    from a true p=0; treat all non-positive inputs as unscored.
    """
    if pval is None or pval in ("", "NA"):
        return 0
    try:
        p = float(pval)
    except ValueError:
        return 0
    # Written as a negated range test on purpose: every comparison with NaN is
    # False, so "p <= 0 or p > 1" would let NaN through and int(nan) below
    # would raise ValueError.
    if not 0 < p <= 1:
        return 0
    score = int(-math.log10(p) * 100)
    return max(0, min(1000, score))


def _float_or_one(val):
    """Parse val as float; None / "" / "NA" / unparsable count as 1.0."""
    if val in (None, "", "NA"):
        return 1.0
    try:
        return float(val)
    except ValueError:
        return 1.0


def pval_to_color(pval, fdr):
    """Color by significance: red if FDR<0.05, orange if p<0.05, grey otherwise."""
    if _float_or_one(fdr) < 0.05:
        return "200,0,0"  # dark red - FDR significant
    if _float_or_one(pval) < 0.05:
        return "255,165,0"  # orange - nominally significant
    return "190,190,190"  # grey - not significant


def safe_float(val):
    """Convert to float, return NaN for NA / empty / non-numeric.

    Using NaN (rather than 0.0) keeps untested variants out of filterByRange
    sliders by default and avoids masquerading as p=0 / fdr=0 in the details
    page. bedToBigBed accepts the literal string "nan" in float fields.
    """
    if val in (None, "", "NA"):
        return math.nan
    try:
        return float(val)
    except ValueError:
        return math.nan


def csv_to_bed(input_csv, hg19_bed, hg38_bed):
    """Parse CSV and write two BED files, one per assembly."""
    hg19_count = 0
    hg38_count = 0

    with open(input_csv, newline="") as fin, \
         open(hg19_bed, "w") as f19, \
         open(hg38_bed, "w") as f38:
        reader = csv.reader(fin)
        header = next(reader)

        for row in reader:
            chrom_num, pos, ref, alt, genome = row[0], row[1], row[2], row[3], row[4]
            rsid, disease, cellline = row[5], row[6], row[7]
            desc, log2fc, pvalue, fdr, study = row[8], row[9], row[10], row[11], row[12]

            # Sanitize user-visible string fields (ASCII only, NBSP mojibake normalized)
            rsid = sanitize_text(rsid)
            disease = sanitize_text(disease)
            cellline = sanitize_text(cellline)
            desc = sanitize_text(desc)
            study = sanitize_text(study)
            ref = sanitize_text(ref)
            alt = sanitize_text(alt)

            chrom = "chr" + chrom_num
            try:
                start = int(pos) - 1  # CSV uses 1-based coordinates
            except ValueError:
                continue
            end = start + max(1, len(ref))  # span the reference allele

            # Build name
            if rsid and rsid != "NA":
                name = rsid
            else:
                name = f"{chrom}:{pos}:{ref}>{alt}"

            score = pval_to_score(pvalue)
            color = pval_to_color(pvalue, fdr)