src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py d069df7855eb648b5b53c51ef5218fe2e4a1d087

d069df7855eb648b5b53c51ef5218fe2e4a1d087
lrnassar
  Wed May 13 12:48:53 2026 -0700
mpraVarDb script rebuild: standardize NA, fix upstream typos, add pvalue note. refs #37359

Build-script changes (mpravardbToBed.py + mpravardb.as):
- sanitize_text() now normalizes "None"/"NA"/"N/A"/"null"/"NULL"/"nan"
to empty string.  Removes 55,108 stale disease sentinels (53,144
disease="None" eQTL rows + 1,964 disease="NA" Kircher rows), plus
44 ref/alt=NA Myint rows.
- Literal-replacement table for three upstream typos:
"30 UTR" -> "3'UTR" (26,546 Schuster description rows),
"Familial hypercholesterol emia" -> "Familial hypercholesterolemia"
(2,176 Kircher disease rows), and
"Alchol use disorder" -> "Alcohol use disorder" (88 Rao disease rows).
- fmt_mo() renders NaN floats as "NA" in mouseOver helper fields;
30,921 rows that previously showed literal "nan" now read "NA".
- Tightened name+rsid handling: treat a value as an rsID only if it
starts with "rs".  2,088 hg19-coord-style names like "1_1403972_CG"
are reformatted to "chr<X>:<hg38pos>:<ref>><alt>" and the rsid field
cleared to "" so the dbSNP linkout doesn't fire on a bogus value.
- Removed the 250-char truncation that cut Griesemer descriptions
mid-sentence; mpravardb.as switched description and mpraStudy from
"string" to "lstring" to admit the full upstream text.

Description page (mpraVarDb.html):
- Added a "Note (pending upstream fix)" paragraph in Methods explaining
the 5,092 rows with pvalue > 1 (test statistic mislabeled in the
pvalue field by upstream curators for Mouri 2022 and Tewhey 2016).
Bracketed by an HTML comment "TEMP: remove once Tao Wang fixes..."
for future cleanup when the next snapshot lands.

bigBed rebuilt; itemCount 239,028 preserved.
Pre-rebuild backup: /hive/data/genomes/hg38/bed/mpra/mpravardb/mpravardb.bb.pre-2026-05-14-backup
Makedoc updated with the QA-2 build-script rebuild section.

diff --git src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py
index 6a6aa29f8cb..984367f26cf 100644
--- src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py
+++ src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py
@@ -29,38 +29,67 @@
     "\u2019": "'",   # RIGHT SINGLE QUOTATION MARK (used as apostrophe)
     "\u2018": "'",   # LEFT SINGLE QUOTATION MARK
     "\u201c": '"',   # LEFT DOUBLE QUOTATION MARK
     "\u201d": '"',   # RIGHT DOUBLE QUOTATION MARK
     "\u2032": "'",   # PRIME (used after numerals: 3'UTR)
     "\u2033": '"',   # DOUBLE PRIME
     "\u2013": "-",   # EN DASH
     "\u2014": "-",   # EM DASH
     "\u2026": "...", # HORIZONTAL ELLIPSIS
     "\u00a0": " ",   # NO-BREAK SPACE
     "\u00ac": "",    # NOT SIGN  (NBSP mojibake pair)
     "\u2020": "",    # DAGGER    (NBSP mojibake pair)
 })
 _WS_RE = re.compile(r"\s+")
 
+# Literal typos in the upstream MPRAVarDB CSV that are visible in user-facing
+# fields (description, disease).  Repaired here on every build until upstream
+# curators fix them.  Counts at first observation 2026-05-01:
+#   - "30 UTR" : 26,546 Schuster 2023 description rows (should be "3'UTR")
+#   - "Familial hypercholesterol emia" : 2,176 Kircher 2019 disease rows
+#   - "Alchol use disorder" : 88 Rao 2021 disease rows
+_TYPO_FIXES = {
+    "30 UTR": "3'UTR",
+    "Familial hypercholesterol emia": "Familial hypercholesterolemia",
+    "Alchol use disorder": "Alcohol use disorder",
+}
+_TYPO_FIX_RE = re.compile("|".join(re.escape(k) for k in _TYPO_FIXES))
+
+# Sentinel strings that upstream uses to mean "no value".  Standardize to ""
+# so mouseOver and details don't carry "None" / "NA" / "nan" tokens.
+_NA_SENTINELS = {"None", "NA", "N/A", "null", "NULL", "nan"}
+
 def sanitize_text(s):
-    """Return ASCII-only version of s for bigBed string fields."""
+    """Return ASCII-only, sentinel-normalized version of s for bigBed string fields."""
     if not s:
         return ""
     s = s.translate(_SANITIZE_MAP)
+    s = _TYPO_FIX_RE.sub(lambda m: _TYPO_FIXES[m.group()], s)
     # Drop any remaining non-ASCII (rare), then collapse runs of whitespace
     s = s.encode("ascii", "ignore").decode("ascii")
-    return _WS_RE.sub(" ", s).strip()
+    s = _WS_RE.sub(" ", s).strip()
+    if s in _NA_SENTINELS:
+        return ""
+    return s
+
+def fmt_mo(val):
+    """Format a float for mouseOver helper fields.  Renders NaN as 'NA'
+    (rather than literal 'nan') so the mouseOver reads 'p-value: NA' on
+    untested variants."""
+    if math.isnan(val):
+        return "NA"
+    return f"{val:.3g}"
 
 def pval_to_score(pval):
     """Convert p-value to a 0-1000 score using -log10.
     Missing / out-of-range / non-numeric → 0 (not 1000).
     Many rows upstream encode NA as literal 0.0, which is indistinguishable from
     a true p=0; treat all non-positive inputs as unscored."""
     if pval is None or pval in ("", "NA"):
         return 0
     try:
         p = float(pval)
     except ValueError:
         return 0
     if p <= 0 or p > 1:
         return 0
     score = int(-math.log10(p) * 100)
@@ -114,64 +143,59 @@
             rsid = sanitize_text(rsid)
             disease = sanitize_text(disease)
             cellline = sanitize_text(cellline)
             desc = sanitize_text(desc)
             study = sanitize_text(study)
             ref = sanitize_text(ref)
             alt = sanitize_text(alt)
 
             chrom = "chr" + chrom_num
             try:
                 start = int(pos) - 1  # CSV uses 1-based coordinates
             except ValueError:
                 continue
             end = start + max(1, len(ref))  # span the reference allele
 
-            # Build name
-            if rsid and rsid != "NA":
+            # Treat rsid as authoritative only if it actually looks like one.
+            # Upstream preserves hg19-coord-style strings (e.g. "1_1403972_CG")
+            # in the rsid column for ~2,088 rows; those would otherwise leak
+            # into name + rsid field + dbSNP linkout.
+            if rsid.startswith("rs"):
                 name = rsid
+                rsid_field = rsid
             else:
                 name = f"{chrom}:{pos}:{ref}>{alt}"
+                rsid_field = ""
 
             score = pval_to_score(pvalue)
             color = pval_to_color(pvalue, fdr)
 
-            # Truncate long string fields to stay within bigBed limits
-            if len(desc) > 250:
-                desc = desc[:247] + "..."
-            if len(study) > 250:
-                study = study[:247] + "..."
-
-            # Short values for mouseOver (3 significant digits)
             log2fc_val = safe_float(log2fc)
             pvalue_val = safe_float(pvalue)
             fdr_val = safe_float(fdr)
-            mo_log2fc = f"{log2fc_val:.3g}"
-            mo_pvalue = f"{pvalue_val:.3g}"
-            mo_fdr = f"{fdr_val:.3g}"
 
             fields = [
                 chrom, str(start), str(end), name, str(score), ".",
                 str(start), str(end), color,
                 ref, alt,
-                rsid if rsid and rsid != "NA" else ".",
+                rsid_field,
                 disease, cellline, desc,
                 str(log2fc_val),
                 str(pvalue_val),
                 str(fdr_val),
                 study,
-                mo_log2fc, mo_pvalue, mo_fdr,
+                fmt_mo(log2fc_val), fmt_mo(pvalue_val), fmt_mo(fdr_val),
             ]
             line = "\t".join(fields) + "\n"
 
             if genome == "hg19":
                 f19.write(line)
                 hg19_count += 1
             elif genome == "hg38":
                 f38.write(line)
                 hg38_count += 1
 
     print(f"Wrote {hg19_count} hg19 rows to {hg19_bed}")
     print(f"Wrote {hg38_count} hg38 rows to {hg38_bed}")
 
 def run(cmd):
     """Run a shell command, exit on failure."""