d069df7855eb648b5b53c51ef5218fe2e4a1d087 lrnassar Wed May 13 12:48:53 2026 -0700 mpraVarDb script rebuild: standardize NA, fix upstream typos, add pvalue note. refs #37359 Build-script changes (mpravardbToBed.py + mpravardb.as): - sanitize_text() now normalizes "None"/"NA"/"N/A"/"null"/"NULL"/"nan" to empty string. Removes 55,108 stale sentinels: 53,144 disease="None" eQTL rows, 1,964 disease="NA" Kircher rows, 44 ref/alt=NA Myint rows. - Literal-replacement table for three upstream typos: "30 UTR" -> "3'UTR" (26,546 Schuster description rows), "Familial hypercholesterol emia" -> "Familial hypercholesterolemia" (2,176 Kircher disease rows), "Alchol use disorder" -> "Alcohol use disorder" (88 Rao disease rows). - fmt_mo() renders NaN floats as "NA" in mouseOver helper fields; 30,921 rows that previously showed literal "nan" now read "NA". - Tightened name+rsid handling: treat a value as an rsID only if it starts with "rs". 2,088 hg19-coord-style names like "1_1403972_CG" are reformatted to "chr<X>:<hg38pos>:<ref>><alt>" and the rsid field cleared to "" so the dbSNP linkout doesn't fire on a bogus value. - Removed the 250-char truncation that cut Griesemer descriptions mid-sentence; mpravardb.as switched description and mpraStudy from "string" to "lstring" to admit the full upstream text. Description page (mpraVarDb.html): - Added a "Note (pending upstream fix)" paragraph in Methods explaining the 5,092 rows with pvalue > 1 (test statistic mislabeled in the pvalue field by upstream curators for Mouri 2022 and Tewhey 2016). Bracketed by an HTML comment "TEMP: remove once Tao Wang fixes..." for future cleanup when the next snapshot lands. bigBed rebuilt; itemCount 239,028 preserved. Pre-rebuild backup: /hive/data/genomes/hg38/bed/mpra/mpravardb/mpravardb.bb.pre-2026-05-14-backup Makedoc updated with the QA-2 build-script rebuild section. 
diff --git src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py index 6a6aa29f8cb..984367f26cf 100644 --- src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py +++ src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py @@ -29,38 +29,67 @@ "\u2019": "'", # RIGHT SINGLE QUOTATION MARK (used as apostrophe) "\u2018": "'", # LEFT SINGLE QUOTATION MARK "\u201c": '"', # LEFT DOUBLE QUOTATION MARK "\u201d": '"', # RIGHT DOUBLE QUOTATION MARK "\u2032": "'", # PRIME (used after numerals: 3'UTR) "\u2033": '"', # DOUBLE PRIME "\u2013": "-", # EN DASH "\u2014": "-", # EM DASH "\u2026": "...", # HORIZONTAL ELLIPSIS "\u00a0": " ", # NO-BREAK SPACE "\u00ac": "", # NOT SIGN (NBSP mojibake pair) "\u2020": "", # DAGGER (NBSP mojibake pair) }) _WS_RE = re.compile(r"\s+") +# Literal typos in the upstream MPRAVarDB CSV that are visible in user-facing +# fields (description, disease). Repaired here on every build until upstream +# curators fix them. Counts at first observation 2026-05-01: +# - "30 UTR" : 26,546 Schuster 2023 description rows (should be "3'UTR") +# - "Familial hypercholesterol emia" : 2,176 Kircher 2019 disease rows +# - "Alchol use disorder" : 88 Rao 2021 disease rows +_TYPO_FIXES = { + "30 UTR": "3'UTR", + "Familial hypercholesterol emia": "Familial hypercholesterolemia", + "Alchol use disorder": "Alcohol use disorder", +} +_TYPO_FIX_RE = re.compile("|".join(re.escape(k) for k in _TYPO_FIXES)) + +# Sentinel strings that upstream uses to mean "no value". Standardize to "" +# so mouseOver and details don't carry "None" / "NA" / "nan" tokens. 
+_NA_SENTINELS = {"None", "NA", "N/A", "null", "NULL", "nan"} + def sanitize_text(s): - """Return ASCII-only version of s for bigBed string fields.""" + """Return ASCII-only, sentinel-normalized version of s for bigBed string fields.""" if not s: return "" s = s.translate(_SANITIZE_MAP) + s = _TYPO_FIX_RE.sub(lambda m: _TYPO_FIXES[m.group()], s) # Drop any remaining non-ASCII (rare), then collapse runs of whitespace s = s.encode("ascii", "ignore").decode("ascii") - return _WS_RE.sub(" ", s).strip() + s = _WS_RE.sub(" ", s).strip() + if s in _NA_SENTINELS: + return "" + return s + +def fmt_mo(val): + """Format a float for mouseOver helper fields. Renders NaN as 'NA' + (rather than literal 'nan') so the mouseOver reads 'p-value: NA' on + untested variants.""" + if math.isnan(val): + return "NA" + return f"{val:.3g}" def pval_to_score(pval): """Convert p-value to a 0-1000 score using -log10. Missing / out-of-range / non-numeric → 0 (not 1000). Many rows upstream encode NA as literal 0.0, which is indistinguishable from a true p=0; treat all non-positive inputs as unscored.""" if pval is None or pval in ("", "NA"): return 0 try: p = float(pval) except ValueError: return 0 if p <= 0 or p > 1: return 0 score = int(-math.log10(p) * 100) @@ -114,64 +143,59 @@ rsid = sanitize_text(rsid) disease = sanitize_text(disease) cellline = sanitize_text(cellline) desc = sanitize_text(desc) study = sanitize_text(study) ref = sanitize_text(ref) alt = sanitize_text(alt) chrom = "chr" + chrom_num try: start = int(pos) - 1 # CSV uses 1-based coordinates except ValueError: continue end = start + max(1, len(ref)) # span the reference allele - # Build name - if rsid and rsid != "NA": + # Treat rsid as authoritative only if it actually looks like one. + # Upstream preserves hg19-coord-style strings (e.g. "1_1403972_CG") + # in the rsid column for ~2,088 rows; those would otherwise leak + # into name + rsid field + dbSNP linkout. 
+ if rsid.startswith("rs"): name = rsid + rsid_field = rsid else: name = f"{chrom}:{pos}:{ref}>{alt}" + rsid_field = "" score = pval_to_score(pvalue) color = pval_to_color(pvalue, fdr) - # Truncate long string fields to stay within bigBed limits - if len(desc) > 250: - desc = desc[:247] + "..." - if len(study) > 250: - study = study[:247] + "..." - - # Short values for mouseOver (3 significant digits) log2fc_val = safe_float(log2fc) pvalue_val = safe_float(pvalue) fdr_val = safe_float(fdr) - mo_log2fc = f"{log2fc_val:.3g}" - mo_pvalue = f"{pvalue_val:.3g}" - mo_fdr = f"{fdr_val:.3g}" fields = [ chrom, str(start), str(end), name, str(score), ".", str(start), str(end), color, ref, alt, - rsid if rsid and rsid != "NA" else ".", + rsid_field, disease, cellline, desc, str(log2fc_val), str(pvalue_val), str(fdr_val), study, - mo_log2fc, mo_pvalue, mo_fdr, + fmt_mo(log2fc_val), fmt_mo(pvalue_val), fmt_mo(fdr_val), ] line = "\t".join(fields) + "\n" if genome == "hg19": f19.write(line) hg19_count += 1 elif genome == "hg38": f38.write(line) hg38_count += 1 print(f"Wrote {hg19_count} hg19 rows to {hg19_bed}") print(f"Wrote {hg38_count} hg38 rows to {hg38_bed}") def run(cmd): """Run a shell command, exit on failure."""