d9568e415754516b14bff72b0daf67a0f7f69999 braney Tue May 12 16:53:34 2026 -0700 quickLiftBench: add nSweep + posSweep orchestrators and paper Table 1, refs #37445 nSweep.py rebuilds testHub at each (N, BW_STEP) point and runs the bench, tagging every row with N + bw_step so the merged sweep.tsv plots "quickLift overhead vs feature count". buildTestHub.sh now also takes a FEATURE_W env var so the BED12 block model scales down at high N (auto-picked by nSweep so requested N fits the region without overlap). posSweep.py mirrors the orchestrator shape but varies the viewed window on a fixed hub; built-in canonical positions cover 0..5000 in-window features against the default testHub. paper_table1.md collects the headline Mode C cells from the 2026-05-12 N + position sweeps: bigBed quickLift ratio scales 4.3x -> 10.3x with feature count, bigWig stays flat ~5x; sparse windows show near-zero quickLift overhead. diff --git src/utils/qa/quickLiftBench/nSweep.py src/utils/qa/quickLiftBench/nSweep.py new file mode 100755 index 00000000000..8467ddbda02 --- /dev/null +++ src/utils/qa/quickLiftBench/nSweep.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python3 +""" +nSweep.py - rebuild testHub at multiple (N, BW_STEP) sizes and run +quickLiftBench.py at each size. Tags every row of every per-point results.tsv +with the sweep's N and BW_STEP so downstream plotting can isolate +"render time vs feature count" or "render time vs bin count" curves. + +For each (N, BW_STEP) combination: + 1. buildTestHub.sh is run with N + BW_STEP env vars into a per-point hub + directory under --hub-dest-base (default + ~/public_html/quickLiftBench/sweep/N{N}_S{S}/). + 2. cases.yaml is loaded and the Mode B / Mode C hub URLs are rewritten to + point at the per-point hub. The rewritten config is dropped into the + output dir. + 3. quickLiftBench.py runs with --config <rewritten> --cases <selected>. + 4. The resulting results.tsv is read back, prepended with N + bw_step + columns, and appended to sweep.tsv in the output dir. + +Refs Redmine #37445. +""" + +import argparse +import csv +import os +import shutil +import statistics +import subprocess +import sys +from datetime import datetime + +import yaml + + +HERE = os.path.dirname(os.path.abspath(__file__)) +BUILDER = os.path.join(HERE, "testHub", "buildTestHub.sh") +BENCH = os.path.join(HERE, "quickLiftBench.py") +DEFAULT_CASES_FILE = os.path.join(HERE, "cases.yaml") +DEFAULT_CASES = "mode_b_bb,mode_b_bw,mode_c_hs1_bb,mode_c_hs1_bw" + + +def parse_args(): + p = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.add_argument( + "--n-values", + default="500,1000,5000,10000,20000", + help="Comma-separated feature counts (default: 500,1000,5000,10000,20000)", + ) + p.add_argument( + "--bw-step-values", + default="1000", + help="Comma-separated bigWig step sizes in bp (default: 1000)", + ) + p.add_argument( + "--cases", + default=DEFAULT_CASES, + help="Comma-separated bench case ids to run at each point " + f"(default: {DEFAULT_CASES})", + ) + p.add_argument( + "--config", + default=DEFAULT_CASES_FILE, + help="Base cases.yaml whose hub URLs are rewritten per point", + ) + p.add_argument( + "--iterations", + type=int, + default=10, + help="Iterations per (point, variant) (default: 10). Sweep defaults " + "lower than the standalone bench since many points are run.", + ) + p.add_argument( + "--warmup", + type=int, + default=1, + help="Warmup iterations per (point, variant) (default: 1)", + ) + p.add_argument( + "--region-start", + type=int, + default=15_000_000, + help="REGION_START passed to buildTestHub.sh (default: 15000000)", + ) + p.add_argument( + "--region-end", + type=int, + default=50_000_000, + help="REGION_END passed to buildTestHub.sh (default: 50000000)", + ) + p.add_argument( + "--feature-w", + type=int, + default=None, + help="Override FEATURE_W per point (default: auto-pick from N + region " + "span so features fit without overlap; clamped to [50, 5000])", + ) + p.add_argument( + "--hub-dest-base", + default=os.path.expanduser("~/public_html/quickLiftBench/sweep"), + help="Parent dir for per-point hub builds. The URL the bench uses is " + "inherited from cases.yaml -- this script rewrites the testHub " + "segment to sweep/<point>, so --hub-dest-base must be served at " + "the equivalent URL.", + ) + p.add_argument( + "--out", + default=None, + help="Output dir (default: results/nsweep-<timestamp>/)", + ) + p.add_argument( + "--clean-builds", + action="store_true", + help="Delete per-point hub dirs after the run (default: keep them)", + ) + p.add_argument( + "--skip-existing", + action="store_true", + help="Skip hub rebuild if the per-point dir already has hub_hs1.txt", + ) + p.add_argument( + "--dry-run", + action="store_true", + help="Print the plan and exit without building or benching", + ) + return p.parse_args() + + +def pick_feature_w(n, region_span, override): + """Pick a FEATURE_W that lets N features fit in region_span without + overlap. buildTestHub.sh uses stride = max(span/N, W), so N fits when + W <= span/N. Clamp to [50, 5000]: small N keeps the original 5000bp + baseline; very large N still produces a valid BED12 with >=50bp width. + At N = region_span/50 = 700k (with default 35M region) the region is + fully packed; beyond that the builder warns and the point is mislabeled.""" + if override is not None: + return override + if n <= 0: + return 5000 + target = region_span // n + return max(50, min(5000, target)) + + +def rewrite_cases(base_cfg, point_subdir, case_ids): + """Filter cases.yaml to the selected ids and rewrite each hub variant's + hubUrl from .../testHub/... to .../sweep/<point_subdir>/... so each + sweep point hits its own freshly-built hub.""" + src_marker = "/quickLiftBench/testHub/" + dst_marker = f"/quickLiftBench/sweep/{point_subdir}/" + out_cases = [] + for case in base_cfg.get("cases") or []: + if case["id"] not in case_ids: + continue + new_case = dict(case) + new_variants = {} + for vname, vraw in (case.get("variants") or {}).items(): + if isinstance(vraw, dict) and "hubUrl" in vraw: + v = dict(vraw) + v["hubUrl"] = v["hubUrl"].replace(src_marker, dst_marker) + new_variants[vname] = v + else: + new_variants[vname] = vraw + new_case["variants"] = new_variants + out_cases.append(new_case) + out = dict(base_cfg) + out["cases"] = out_cases + return out + + +def median_or_none(xs): + xs = [x for x in xs if x is not None] + if not xs: + return None + return int(statistics.median(xs)) + + +def p90(xs): + xs = sorted(x for x in xs if x is not None) + if not xs: + return None + if len(xs) == 1: + return xs[0] + k = max(0, int(round(0.9 * (len(xs) - 1)))) + return xs[k] + + +def to_int(v): + if v == "" or v is None: + return None + try: + return int(v) + except ValueError: + return None + + +def write_summary(merged_path, summary_path): + """Group sweep.tsv rows by (N, bw_step, case, variant) and emit median / + p90 of total_ms, load_ms_sum, draw_ms_sum. Then a second section with + per (N, bw_step, case) ratio of lifted/native total medians (sorted by N) + -- the headline curve the paper plots.""" + rows = [] + with open(merged_path) as f: + reader = csv.DictReader(f, delimiter="\t") + for r in reader: + rows.append(r) + + by_group = {} + for r in rows: + if r.get("error"): + continue + key = (int(r["N"]), int(r["bw_step"]), r["case"], r["variant"]) + by_group.setdefault(key, []).append(r) + + fields = [ + "N", "bw_step", "case", "variant", + "n_ok", + "total_median", "total_p90", + "load_sum_median", "load_sum_p90", + "draw_sum_median", "draw_sum_p90", + ] + per_key_stats = {} + with open(summary_path, "w", newline="") as f: + w = csv.DictWriter(f, fieldnames=fields, delimiter="\t") + w.writeheader() + for key in sorted(by_group): + group = by_group[key] + total = [to_int(r["total_ms"]) for r in group] + load = [to_int(r["load_ms_sum"]) for r in group] + draw = [to_int(r["draw_ms_sum"]) for r in group] + stats = { + "N": key[0], "bw_step": key[1], + "case": key[2], "variant": key[3], + "n_ok": len(group), + "total_median": median_or_none(total), + "total_p90": p90(total), + "load_sum_median": median_or_none(load), + "load_sum_p90": p90(load), + "draw_sum_median": median_or_none(draw), + "draw_sum_p90": p90(draw), + } + w.writerow({k: ("" if v is None else v) for k, v in stats.items()}) + per_key_stats[key] = stats + + f.write("\n# Pairwise ratio per (N, bw_step, case): lifted/native\n") + pair_fields = [ + "N", "bw_step", "case", + "native_total_median", "lifted_total_median", + "ratio_total", "ratio_load_sum", "ratio_draw_sum", + ] + wp = csv.DictWriter(f, fieldnames=pair_fields, delimiter="\t") + wp.writeheader() + seen_cases = set((n, s, c) for (n, s, c, _v) in per_key_stats) + for n, s, c in sorted(seen_cases): + ns = per_key_stats.get((n, s, c, "native")) + ls = per_key_stats.get((n, s, c, "lifted")) + if not ns or not ls: + continue + def ratio(num, denom): + if num is None or denom is None or denom == 0: + return "" + return f"{num / denom:.2f}" + wp.writerow({ + "N": n, "bw_step": s, "case": c, + "native_total_median": "" if ns["total_median"] is None else ns["total_median"], + "lifted_total_median": "" if ls["total_median"] is None else ls["total_median"], + "ratio_total": ratio(ls["total_median"], ns["total_median"]), + "ratio_load_sum": ratio(ls["load_sum_median"], ns["load_sum_median"]), + "ratio_draw_sum": ratio(ls["draw_sum_median"], ns["draw_sum_median"]), + }) + + +def main(): + args = parse_args() + + try: + n_values = [int(x) for x in args.n_values.split(",") if x.strip()] + bw_step_values = [int(x) for x in args.bw_step_values.split(",") if x.strip()] + except ValueError as e: + sys.exit(f"--n-values / --bw-step-values must be ints: {e}") + case_ids = [c.strip() for c in args.cases.split(",") if c.strip()] + if not n_values or not bw_step_values or not case_ids: + sys.exit("must supply at least one N, one BW_STEP, and one case id") + + with open(args.config) as f: + base_cfg = yaml.safe_load(f) + known_ids = {c["id"] for c in base_cfg.get("cases") or []} + unknown = set(case_ids) - known_ids + if unknown: + sys.exit(f"unknown case id(s): {sorted(unknown)} (known: {sorted(known_ids)})") + + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + out_dir = args.out or os.path.join(HERE, "results", f"nsweep-{timestamp}") + os.makedirs(out_dir, exist_ok=True) + sweep_path = os.path.join(out_dir, "sweep.tsv") + summary_path = os.path.join(out_dir, "sweep_summary.tsv") + print(f"sweep output -> {out_dir}", file=sys.stderr) + + region_span = args.region_end - args.region_start + if region_span <= 0: + sys.exit("--region-end must be greater than --region-start") + + points = [] + for n in n_values: + for s in bw_step_values: + w = pick_feature_w(n, region_span, args.feature_w) + points.append((n, s, w)) + + print( + f"plan: {len(points)} points, " + f"{args.iterations} iterations x {len(case_ids)} cases x 2 variants, " + f"region={args.region_start}-{args.region_end} ({region_span}bp)", + file=sys.stderr, + ) + for n, s, w in points: + print(f" N={n} BW_STEP={s} FEATURE_W={w}", file=sys.stderr) + if args.dry_run: + return + + merged_writer = None + built_dirs = [] + with open(sweep_path, "w", newline="") as merged_fp: + for n, s, w in points: + point_subdir = f"N{n}_S{s}_W{w}" + hub_dir = os.path.join(args.hub_dest_base, point_subdir) + os.makedirs(hub_dir, exist_ok=True) + built_dirs.append(hub_dir) + + hub_marker = os.path.join(hub_dir, "hub_hs1.txt") + if args.skip_existing and os.path.exists(hub_marker): + print(f"\n=== reuse hub N={n} BW_STEP={s} W={w} ({hub_dir})", file=sys.stderr) + else: + print(f"\n=== build hub N={n} BW_STEP={s} W={w} -> {hub_dir}", file=sys.stderr) + env = os.environ.copy() + env["N"] = str(n) + env["BW_STEP"] = str(s) + env["FEATURE_W"] = str(w) + env["REGION_START"] = str(args.region_start) + env["REGION_END"] = str(args.region_end) + subprocess.run(["bash", BUILDER, hub_dir], env=env, check=True) + + point_cfg = rewrite_cases(base_cfg, point_subdir, case_ids) + cfg_path = os.path.join(out_dir, f"cases_{point_subdir}.yaml") + with open(cfg_path, "w") as f: + yaml.safe_dump(point_cfg, f, sort_keys=False) + + point_out = os.path.join(out_dir, point_subdir) + print(f"=== bench N={n} BW_STEP={s}", file=sys.stderr) + subprocess.run( + [ + sys.executable, BENCH, + "--config", cfg_path, + "--iterations", str(args.iterations), + "--warmup", str(args.warmup), + "--out", point_out, + ], + check=True, + ) + + per_point_results = os.path.join(point_out, "results.tsv") + with open(per_point_results) as rf: + reader = csv.DictReader(rf, delimiter="\t") + if merged_writer is None: + fields = ["N", "bw_step"] + reader.fieldnames + merged_writer = csv.DictWriter( + merged_fp, fieldnames=fields, delimiter="\t" + ) + merged_writer.writeheader() + for row in reader: + row["N"] = n + row["bw_step"] = s + merged_writer.writerow(row) + merged_fp.flush() + + if args.clean_builds: + for d in built_dirs: + shutil.rmtree(d, ignore_errors=True) + + write_summary(sweep_path, summary_path) + print(f"\nmerged: {sweep_path}", file=sys.stderr) + print(f"summary: {summary_path}", file=sys.stderr) + + +if __name__ == "__main__": + main()