#!/usr/bin/env python3
# rtsUpdate -- fetch a curator's Recommended Track Set session from hgcentral
# and write the scrubbed key=value contents into htdocs/data/recTrackSets/,
# where Chris's file-based loader reads it. No DB writes.
# Refs #32768, #34907.
#
# Change history (from the commit this file was recovered from):
#   58e070461663f4d71ce17eb93b17225b20071371  lrnassar  Fri Jun 5 10:12:34 2026 -0700
#   Expand verboten.lst with 18 more patterns surfaced during Phase C1 dry-run
#   of the remaining 12 RTS sessions: leaked state from other CGIs (hgg_,
#   hglft_, hgta_, hgHub_do_search), additional hgTracks UI state (hgt_,
#   hgt_configGroupTarget, hgt_doJsCommand, hgt_mdbVal/Var, rulerBaseZoom,
#   hgTracksConfigPage), debris (European, source, sessionTable_length),
#   per-db reverse-complement toggle (complement_<db>), gateway-style
#   position-search input (search), Track Search dialog state (ts*), and
#   single-letter hgc track selector (g). Also drop pairs with empty or
#   whitespace-tainted keys in scrub() to defend against future cart-string
#   corruption (caught a stray ' hgsid=...' from a manually-edited 2021 cart
#   row in BRCA1_BRCA2_ENIGMA_hg19). Re-fetch the 2 already-seeded files so the
#   whole corpus uses the final scrub list. refs #32768

import argparse
import datetime
import getpass
import pathlib
import re
import subprocess
import sys
from urllib.parse import urlparse

HOME = pathlib.Path.home()
RTS_DIR = HOME / "kent/src/hg/utils/rts"
DATA_DIR = HOME / "kent/src/hg/htdocs/data/recTrackSets"
INC_DIR = HOME / "kent/src/hg/htdocs/inc"
LOGFILE = HOME / ".rtsUpdate.log"
VERBOTEN = RTS_DIR / "verboten.lst"
TARGET_USER = "View"

# Source host => (hgsql -h target, db). Default source is rr (canonical curator save location).
HOSTS = {
    "rr": ("genome-centdb", "hgcentral"),
    "dev": ("hgwdev", "hgcentraltest"),
    "beta": ("hgwbeta", "hgcentralbeta"),
}

# Session and userName must match this. Excludes anything that could be SQL- or
# shell-special; permits the URL-encoding form ('%' and digits) used for sessions
# whose human-readable names contain spaces (e.g. 'CNVs%20Clinical').
SAFE_NAME = re.compile(r"^[A-Za-z0-9_%.\-]+$")


def log(msg):
    """Append one line to LOGFILE: UTC timestamp, current user name, then msg."""
    ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    line = f"{ts} {getpass.getuser()} {msg}\n"
    LOGFILE.parent.mkdir(parents=True, exist_ok=True)
    with open(LOGFILE, "a") as f:
        f.write(line)


def die(msg, code=1):
    """Print 'ERROR: <msg>' to stderr and exit the process with status `code`."""
    print(f"ERROR: {msg}", file=sys.stderr)
    sys.exit(code)


def confirm(prompt):
    """Ask a yes/no question on stdin until the answer is exactly 'yes' or 'no'.

    Returns True for 'yes' and False for 'no'. End-of-input (EOFError) is
    treated as 'no'.
    """
    while True:
        try:
            ans = input(prompt + " (yes/no): ").strip().lower()
        except EOFError:
            return False
        if ans == "yes":
            print()
            return True
        if ans == "no":
            print()
            return False
        print(" please type 'yes' or 'no'")


def validate_name(name, label):
    """Exit via die() unless `name` is non-empty and consists only of SAFE_NAME characters.

    `label` names the offending argument in the error message.
    """
    # fullmatch() rather than match(): the '$' in SAFE_NAME also matches just
    # before a trailing newline, so match() would let 'name\n' through into the
    # SQL string and the output file path.
    if not name or not SAFE_NAME.fullmatch(name):
        die(f"unsafe {label}: {name!r} (must match {SAFE_NAME.pattern})")


# ---------- hgsql shim (read-only) ----------

def hgsql_read(host_key, sql):
    """Run a SELECT. Returns list[list[str]].

    Caller is responsible for interpolating only validated values into sql.
    `host_key` selects the (host, db) pair from HOSTS. Exits via die() when
    hgsql returns a non-zero status. Rows are split on newlines and columns on
    tabs, exactly as hgsql printed them.
    """
    host, db = HOSTS[host_key]
    cmd = ["hgsql", "-h", host, db, "-N", "-B", "-e", sql]
    p = subprocess.run(cmd, capture_output=True, text=True)
    if p.returncode != 0:
        die(f"hgsql read {host_key} failed: {p.stderr.strip()}")
    out = p.stdout
    # Drop only the final newline so a trailing empty column is not lost.
    if out.endswith("\n"):
        out = out[:-1]
    if not out:
        return []
    return [row.split("\t") for row in out.split("\n")]


# ---------- RTS domain ----------

def load_verboten():
    """Compile every pattern in VERBOTEN (one regex per line) and return the list.

    Blank lines and lines starting with '#' are skipped. A line that is not a
    valid regular expression aborts via die() with the file, line number and
    pattern, instead of surfacing as a bare re.error traceback.
    """
    pats = []
    for lineno, raw in enumerate(VERBOTEN.read_text().splitlines(), start=1):
        pattern = raw.strip()
        if not pattern or pattern.startswith("#"):
            continue
        try:
            pats.append(re.compile(pattern))
        except re.error as err:
            die(f"bad regex in {VERBOTEN} line {lineno}: {pattern!r} ({err})")
    return pats


def split_pairs(contents):
    """Split an '&'-joined cart string into its non-empty 'key=value' pieces."""
    return [p for p in contents.split("&") if p]


def join_pairs(pairs):
    """Join pairs back into a single '&'-separated string."""
    return "&".join(pairs)


def scrub(pairs, verboten):
    """Return the sorted, de-duplicated pairs whose key survives filtering.

    The key is the text before the first '='. A pair is dropped when its key
    is empty, contains any whitespace, or matches (anchored at the start of the
    key, via rx.match) any compiled pattern in `verboten`.
    """
    out = []
    for p in pairs:
        key = p.split("=", 1)[0]
        # Drop pairs with empty or whitespace-tainted keys (corruption / stray
        # data from manual session edits — e.g. an `& hgsid=...` artifact).
        if not key or any(c.isspace() for c in key):
            continue
        if any(rx.match(key) for rx in verboten):
            continue
        out.append(p)
    return sorted(set(out))


def find_db_for_session(session):
    """Return the assembly db for a session by scanning recTrackSets.<db>.tab.

    Reads from htdocs/inc/ (the authoritative manifest location). Files are
    visited in sorted order; the first one with a row whose third tab-separated
    column equals `session` wins. Returns None when no manifest lists it.
    """
    for tab in sorted(INC_DIR.glob("recTrackSets.*.tab")):
        m = re.match(r"recTrackSets\.([A-Za-z0-9]+)\.tab$", tab.name)
        if not m:
            continue
        db = m.group(1)
        for line in tab.read_text().splitlines():
            if line.startswith("#") or not line.strip():
                continue
            cols = line.split("\t")
            if len(cols) >= 3 and cols[2] == session:
                return db
    return None


def kent_path(db, session):
    """Return the path of the session file: DATA_DIR/<db>/<session>."""
    return DATA_DIR / db / session


def write_kent_file(db, session, pairs):
    """Write pairs (one per line) to the kent tree atomically via temp+rename.

    Creates the parent directory if needed and returns the final path.
    """
    path = kent_path(db, session)
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.parent / (path.name + ".tmp")
    tmp.write_text("\n".join(pairs) + "\n")
    tmp.replace(path)
    return path


def fetch_session_row(host_key, user, session):
    """Return (contents, settings, exists).

    Both names are validated with validate_name() before being interpolated
    into the SQL text. When no row matches, returns ("", "", False). Only the
    first returned row is used.
    """
    validate_name(user, "userName")
    validate_name(session, "sessionName")
    sql = (f"SELECT contents, settings FROM namedSessionDb "
           f"WHERE userName='{user}' AND sessionName='{session}'")
    rows = hgsql_read(host_key, sql)
    if not rows:
        return "", "", False
    row = rows[0]
    contents = row[0] if len(row) > 0 else ""
    settings = row[1] if len(row) > 1 else ""
    return contents, settings, True


def show_pair_diff(label_a, pairs_a, label_b, pairs_b, verbose):
    """Symmetric diff between two pair lists.

    Prints how many pairs are unique to each side; with `verbose`, also prints
    each such pair prefixed with '-' (only in a) or '+' (only in b).
    """
    set_a = set(pairs_a)
    set_b = set(pairs_b)
    only_a = sorted(set_a - set_b)
    only_b = sorted(set_b - set_a)
    print(f" - {len(only_a)} vars in [{label_a}] not in [{label_b}]")
    print(f" + {len(only_b)} vars in [{label_b}] not in [{label_a}]")
    if verbose:
        for p in only_a:
            print(f" -{p}")
        for p in only_b:
            print(f" +{p}")


def show_git_diff_for(path):
    """Run `git -C kent diff -- <path>`, after `git add -N` so new files show.

    Best-effort: returns silently when the git executable is not found, and
    ignores non-zero exit statuses from git.
    """
    rel = str(path)
    try:
        subprocess.run(["git", "-C", str(HOME / "kent"), "add", "-N", rel],
                       check=False, capture_output=True)
    except FileNotFoundError:
        return
    print("\n--- git diff (kent tree) ---")
    subprocess.run(["git", "-C", str(HOME / "kent"), "--no-pager", "diff",
                    "--no-color", "--", rel], check=False)


# ---------- subcommand ----------

def cmd_fetch(args):
    """Implement the `fetch` subcommand.

    Resolves the source user/session (from --src-url or --src-user plus
    --src-session), validates all names, looks up the target assembly in the
    recTrackSets.<db>.tab manifests, fetches the session contents, scrubs
    them, and then either reports what would change (dry run, the default) or,
    with --commit, writes the file and asks whether to keep it.

    Returns 0 after a dry run or a kept update, 1 when the user chooses to
    revert. Exits via die() on invalid input or a missing source/target.
    """
    verboten = load_verboten()

    if args.src_url:
        u = urlparse(args.src_url)
        m = re.match(r"^/s/([^/]+)/([^/]+)/?$", u.path)
        if not m:
            die(f"bad --src-url; expected .../s/<user>/<sessionName>: {args.src_url}")
        src_user, src_session = m.group(1), m.group(2)
    else:
        if not (args.src_user and args.src_session):
            die("must provide --src-url OR (--src-user AND --src-session)")
        src_user, src_session = args.src_user, args.src_session

    target = args.target_session
    validate_name(src_user, "src userName")
    validate_name(src_session, "src sessionName")
    validate_name(target, "target sessionName")

    # .tab validation: must be listed in recTrackSets.<db>.tab. This determines
    # which assembly the file lands under and rejects typos / missing manifest rows.
    db = find_db_for_session(target)
    if db is None:
        die(f"target session {target!r} not found in any recTrackSets.<db>.tab\n"
            f" Add a row to ~/kent/src/hg/htdocs/inc/recTrackSets.<db>.tab first, then re-run.")
    print(f"assembly resolved from recTrackSets.{db}.tab: {db}")

    src_host = args.src_host
    src_h, src_db = HOSTS[src_host]
    print(f"Fetching {src_user}/{src_session} from {src_host} ({src_h}:{src_db})...")
    contents, _, exists = fetch_session_row(src_host, src_user, src_session)
    if not exists:
        die(f"source session {src_user}/{src_session} not found on {src_host}")

    pairs = split_pairs(contents)
    print(f" fetched {len(pairs)} cart variables ({len(contents)} bytes)")
    scrubbed = scrub(pairs, verboten)
    dropped = len(pairs) - len(scrubbed)
    print(f" scrubbed: dropped {dropped} verboten/duplicate vars, kept {len(scrubbed)}")

    target_path = kent_path(db, target)
    # Keep the previous file text itself (None = file did not exist) so that a
    # revert restores what was there rather than a re-serialised approximation,
    # and so that an existing-but-empty file is restored instead of deleted.
    previous_text = target_path.read_text() if target_path.exists() else None
    existing_pairs = []
    if previous_text is not None:
        existing_pairs = [ln.rstrip() for ln in previous_text.splitlines() if ln.strip()]

    if not args.commit:
        print(f"\n--dry-run-- (no --commit flag); would write to {target_path}")
        print(f" ({len(existing_pairs)} -> {len(scrubbed)} cart vars after scrub)")
        show_pair_diff("existing", existing_pairs, "new", scrubbed, args.verbose)
        return 0

    path = write_kent_file(db, target, scrubbed)
    print(f" wrote {path}")
    show_git_diff_for(path)
    if existing_pairs:
        print("\n--- diff: previous file vs new ---")
        show_pair_diff("prev", existing_pairs, "new", scrubbed, args.verbose)

    if not confirm("\nKeep this update (yes), or revert (no)?"):
        # Restore previous content if the file existed; otherwise remove the new file
        if previous_text is not None:
            tmp = path.parent / (path.name + ".tmp")
            tmp.write_text(previous_text)
            tmp.replace(path)
            print(f" reverted {path}")
        else:
            path.unlink()
            print(f" removed {path}")
        return 1

    log(f"fetch src={src_host}:{src_user}/{src_session} target={target} db={db} -> {path}")
    print()
    print("File written. To test in your sandbox:")
    print(" cd ~/kent/src/hg/htdocs/data && make user")
    print(f" Then click the RTS in the dialog at https://hgwdev-{getpass.getuser()}.gi.ucsc.edu/cgi-bin/hgTracks?db={db}")
    print()
    print("Once validated, commit and push:")
    print(" cd ~/kent")
    print(f" git add src/hg/htdocs/data/recTrackSets/{db}/{target}")
    print(f" git commit -m 'Updating RTS {target}. refs #32768'")
    print(" git push")
    return 0


# ---------- main ----------

def build_parser():
    """Build the argparse parser: one required subcommand, `fetch`."""
    p = argparse.ArgumentParser(
        prog="rtsUpdate",
        description="Fetch a curator's Recommended Track Set from hgcentral, scrub it, "
                    "and write the result to htdocs/data/recTrackSets/<db>/<session>. "
                    "Refs #32768, #34907.")
    sub = p.add_subparsers(dest="cmd", required=True)

    pf = sub.add_parser("fetch", help="fetch source session, scrub, write the htdocs file")
    pf.add_argument("--src-url", help="full URL like https://genome.ucsc.edu/s/<user>/<session>")
    pf.add_argument("--src-user", help="source userName (alternative to --src-url)")
    pf.add_argument("--src-session", help="source sessionName (alternative to --src-url)")
    pf.add_argument("--src-host", choices=("rr", "dev", "beta"), default="rr",
                    help="where to read the source from (default: rr)")
    pf.add_argument("--target-session", required=True,
                    help="target sessionName under userName='View'; must already "
                         "appear in recTrackSets.<db>.tab")
    pf.add_argument("--commit", action="store_true",
                    help="actually write the file (default: dry-run)")
    pf.add_argument("--verbose", "-v", action="store_true",
                    help="show every var in diffs")
    return p


def main():
    """Parse the command line and dispatch to the subcommand; returns its exit status."""
    # Line-buffer stdout so prints interleave correctly with subprocess output
    # (git diff, hgsql) that writes directly via the OS.
    try:
        sys.stdout.reconfigure(line_buffering=True)
    except AttributeError:
        pass
    args = build_parser().parse_args()
    if args.cmd == "fetch":
        return cmd_fetch(args)
    return 0


if __name__ == "__main__":
    sys.exit(main() or 0)