09f347da2873b42ab1b5c6f5684e429dc4e321d5 lrnassar Mon May 4 08:52:54 2026 -0700 Switch mpravardbToBed sanitize map to str.maketrans with \u escape keys per CR feedback. refs #37359 Replaces literal UTF-8 characters in the dictionary keys (curly quotes, primes, dashes, NBSP, mojibake) with their \uXXXX escape forms so an editor can't silently re-encode them and break the pipeline. Translation now uses str.translate(), and the whitespace-collapse regex is precompiled at module level. diff --git src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py index c97884640b5..6a6aa29f8cb 100644 --- src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py +++ src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py @@ -11,56 +11,56 @@ import csv import re import subprocess import sys import os import math SCRIPT_DIR = os.path.join(os.environ["HOME"], "kent/src/hg/makeDb/scripts/mpravardb") AS_FILE = os.path.join(SCRIPT_DIR, "mpravardb.as") LIFTOVER_CHAIN = "/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz" CHROM_SIZES = "/hive/data/genomes/hg38/chrom.sizes" INPUT_CSV = "mpravardb.csv" # Upstream CSV contains UTF-8 curly quotes, primes, and NBSP mojibake. # Browser does not transcode UTF-8 in bigBed fields, so everything user-visible -# must be plain ASCII. Transliterate or strip. -_SANITIZE_MAP = { - "’": "'", # RIGHT SINGLE QUOTATION MARK (used as apostrophe) - "‘": "'", # LEFT SINGLE QUOTATION MARK - "“": '"', # LEFT DOUBLE QUOTATION MARK - "”": '"', # RIGHT DOUBLE QUOTATION MARK - "′": "'", # PRIME (used after numerals: 3'UTR) - "″": '"', # DOUBLE PRIME - "–": "-", # EN DASH - "—": "-", # EM DASH - "…": "...", # HORIZONTAL ELLIPSIS - " ": " ", # NO-BREAK SPACE - "¬": "", # NOT SIGN (NBSP mojibake pair) - "†": "", # DAGGER (NBSP mojibake pair) -} -_SANITIZE_RE = re.compile("|".join(re.escape(k) for k in _SANITIZE_MAP)) +# must be plain ASCII. Transliterate or strip. Keys use \u escapes rather than +# literal characters so an editor can't silently re-encode them. +_SANITIZE_MAP = str.maketrans({ + "\u2019": "'", # RIGHT SINGLE QUOTATION MARK (used as apostrophe) + "\u2018": "'", # LEFT SINGLE QUOTATION MARK + "\u201c": '"', # LEFT DOUBLE QUOTATION MARK + "\u201d": '"', # RIGHT DOUBLE QUOTATION MARK + "\u2032": "'", # PRIME (used after numerals: 3'UTR) + "\u2033": '"', # DOUBLE PRIME + "\u2013": "-", # EN DASH + "\u2014": "-", # EM DASH + "\u2026": "...", # HORIZONTAL ELLIPSIS + "\u00a0": " ", # NO-BREAK SPACE + "\u00ac": "", # NOT SIGN (NBSP mojibake pair) + "\u2020": "", # DAGGER (NBSP mojibake pair) +}) +_WS_RE = re.compile(r"\s+") def sanitize_text(s): """Return ASCII-only version of s for bigBed string fields.""" - if s is None: + if not s: return "" - out = _SANITIZE_RE.sub(lambda m: _SANITIZE_MAP[m.group()], s) + s = s.translate(_SANITIZE_MAP) # Drop any remaining non-ASCII (rare), then collapse runs of whitespace - out = out.encode("ascii", "ignore").decode("ascii") - out = re.sub(r"\s+", " ", out).strip() - return out + s = s.encode("ascii", "ignore").decode("ascii") + return _WS_RE.sub(" ", s).strip() def pval_to_score(pval): """Convert p-value to a 0-1000 score using -log10. Missing / out-of-range / non-numeric → 0 (not 1000). Many rows upstream encode NA as literal 0.0, which is indistinguishable from a true p=0; treat all non-positive inputs as unscored.""" if pval is None or pval in ("", "NA"): return 0 try: p = float(pval) except ValueError: return 0 if p <= 0 or p > 1: return 0 score = int(-math.log10(p) * 100)