d9568e415754516b14bff72b0daf67a0f7f69999 braney Tue May 12 16:53:34 2026 -0700 quickLiftBench: add nSweep + posSweep orchestrators and paper Table 1, refs #37445 nSweep.py rebuilds testHub at each (N, BW_STEP) point and runs the bench, tagging every row with N + bw_step so the merged sweep.tsv plots "quickLift overhead vs feature count". buildTestHub.sh now also takes a FEATURE_W env var so the BED12 block model scales down at high N (auto-picked by nSweep so requested N fits the region without overlap). posSweep.py mirrors the orchestrator shape but varies the viewed window on a fixed hub; built-in canonical positions cover 0..5000 in-window features against the default testHub. paper_table1.md collects the headline Mode C cells from the 2026-05-12 N + position sweeps: bigBed quickLift ratio scales 4.3x -> 10.3x with feature count, bigWig stays flat ~5x; sparse windows show near-zero quickLift overhead. diff --git src/utils/qa/quickLiftBench/testHub/buildTestHub.sh src/utils/qa/quickLiftBench/testHub/buildTestHub.sh index 784cabe7573..efe915b784b 100755 --- src/utils/qa/quickLiftBench/testHub/buildTestHub.sh +++ src/utils/qa/quickLiftBench/testHub/buildTestHub.sh @@ -9,64 +9,95 @@ # - modeB_bb_native -- the hg38-coord bigBed rendered natively. # - modeB_bw_native -- the hg38-coord bigWig rendered natively. # Mode B's "lifted" variant is just modeC_bb_lifted / modeC_bw_lifted on hs1. # Drops bb/bw files + chain pair + hub.txt into $1 # (default ~/public_html/quickLiftBench/testHub). # # Refs Redmine #37445. set -euo pipefail N="${N:-5000}" SEED="${SEED:-42}" REGION_START="${REGION_START:-15000000}" REGION_END="${REGION_END:-50000000}" BW_STEP="${BW_STEP:-1000}" +# Per-feature span in bp. Stride is max(span/N, FEATURE_W), so shrinking +# FEATURE_W lets the bench fit more features in the same region without +# forcing pack-stacking from overlaps. Adjacent features are placed +# back-to-back (no gap) when N saturates the region. The BED12 block model +# scales proportionally with FEATURE_W (three blocks at 10%/40%/10% of width +# with 30% and 90% block-start offsets), preserving the same exon/intron +# shape across all sizes. +FEATURE_W="${FEATURE_W:-5000}" DEST="${1:-$HOME/public_html/quickLiftBench/testHub}" HG38_CHROM_SIZES=/hive/data/genomes/hg38/chrom.sizes HS1_CHROM_SIZES=/hive/data/genomes/hs1/chrom.sizes HG38_TO_HS1_CHAIN=/gbdb/hg38/liftOver/hg38ToHs1.over.chain.gz # bigChain pair used by quickLift for hg38 -> hs1 on-the-fly remap. # Copied into the hub dir so the bench is self-contained on one host. QUICK_CHAIN_SRC=/hive/data/genomes/hs1/bed/lastz.hg38/axtChain/hs1.hg38.quick.bb QUICK_CHAIN_LINK_SRC=/hive/data/genomes/hs1/bed/lastz.hg38/axtChain/hs1.hg38.quickLink.bb mkdir -p "$DEST" cd "$DEST" -python3 - "$N" "$SEED" "$REGION_START" "$REGION_END" > modeC_hg38.bed <<'PY' +python3 - "$N" "$SEED" "$REGION_START" "$REGION_END" "$FEATURE_W" > modeC_hg38.bed <<'PY' import random, sys -N, seed, start_region, end_region = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +N, seed, start_region, end_region, W = (int(sys.argv[i]) for i in range(1, 6)) +if W < 50: + sys.exit(f"FEATURE_W must be >= 50bp (got {W})") random.seed(seed) chrom = "chr22" span = end_region - start_region -# Lay features at regular stride so they don't overlap (~5kb each). -stride = max(span // N, 6000) +# Lay features at regular stride. Stride must clear the feature width itself +# to keep features non-overlapping; at high N adjacent features are placed +# back-to-back (stride == W) with no gap. +stride = max(span // N, W) +# Block model: three blocks at 10% / 40% / 10% of width, starting at +# offsets 0, 30%, and 90% of width. Matches the original 500/2000/500 + +# 0/1500/4500 pattern when W=5000. Bound each block at >=1bp so very small +# FEATURE_W (e.g. 50) still produces a valid BED12. +def w_at(frac): + return max(1, int(round(W * frac))) +b1, b2, b3 = w_at(0.1), w_at(0.4), w_at(0.1) +s1, s2, s3 = 0, w_at(0.3), w_at(0.9) +thick_lo = w_at(0.2) +thick_hi = W - w_at(0.2) +emitted = 0 for i in range(N): s = start_region + i * stride - e = s + 5000 + e = s + W if e > end_region: break strand = "+" if i % 2 == 0 else "-" - thickStart, thickEnd = s + 1000, e - 1000 + thickStart, thickEnd = s + thick_lo, s + thick_hi print("\t".join(map(str, [ chrom, s, e, f"feat_{i:05d}", 0, strand, thickStart, thickEnd, "100,100,200", - 3, "500,2000,500,", "0,1500,4500," + 3, f"{b1},{b2},{b3},", f"{s1},{s2},{s3},", ]))) + emitted += 1 +if emitted < N: + print( + f"WARNING: requested N={N} but only {emitted} features fit " + f"in {span}bp at stride={stride}bp (FEATURE_W={W}). " + f"Pass a smaller FEATURE_W or larger REGION span.", + file=sys.stderr, + ) PY liftOver -bedPlus=12 modeC_hg38.bed "$HG38_TO_HS1_CHAIN" modeC_hs1.bed unmapped_hs1.bed sort -k1,1 -k2,2n modeC_hg38.bed -o modeC_hg38.bed sort -k1,1 -k2,2n modeC_hs1.bed -o modeC_hs1.bed bedToBigBed -type=bed12 modeC_hg38.bed "$HG38_CHROM_SIZES" modeC_hg38.bb bedToBigBed -type=bed12 modeC_hs1.bed "$HS1_CHROM_SIZES" modeC_hs1.bb # ---- bigWig: BedGraph at BW_STEP bp resolution, lifted by chain ---- python3 - "$REGION_START" "$REGION_END" "$BW_STEP" <<'PY' > modeC_hg38.bedGraph import math, sys s0, e0, step = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]) chrom = "chr22"