d9568e415754516b14bff72b0daf67a0f7f69999 braney Tue May 12 16:53:34 2026 -0700 quickLiftBench: add nSweep + posSweep orchestrators and paper Table 1, refs #37445 nSweep.py rebuilds testHub at each (N, BW_STEP) point and runs the bench, tagging every row with N + bw_step so the merged sweep.tsv plots "quickLift overhead vs feature count". buildTestHub.sh now also takes a FEATURE_W env var so the BED12 block model scales down at high N (auto-picked by nSweep so requested N fits the region without overlap). posSweep.py mirrors the orchestrator shape but varies the viewed window on a fixed hub; built-in canonical positions cover 0..5000 in-window features against the default testHub. paper_table1.md collects the headline Mode C cells from the 2026-05-12 N + position sweeps: bigBed quickLift ratio scales 4.3x -> 10.3x with feature count, bigWig stays flat ~5x; sparse windows show near-zero quickLift overhead. diff --git src/utils/qa/quickLiftBench/testHub/buildTestHub.sh src/utils/qa/quickLiftBench/testHub/buildTestHub.sh index 784cabe7573..efe915b784b 100755 --- src/utils/qa/quickLiftBench/testHub/buildTestHub.sh +++ src/utils/qa/quickLiftBench/testHub/buildTestHub.sh @@ -1,192 +1,223 @@ #!/bin/bash # buildTestHub.sh - regenerate the Mode B + Mode C reference hub data. # # Builds a self-contained two-genome hub: # * hs1 genome (Mode C: lift on/off, same hub + position): # - modeC_bb_native / modeC_bb_lifted -- bigBed12, N features. # - modeC_bw_native / modeC_bw_lifted -- bigWig at BW_STEP bp resolution. # * hg38 genome (Mode B: native rendering of the same source data): # - modeB_bb_native -- the hg38-coord bigBed rendered natively. # - modeB_bw_native -- the hg38-coord bigWig rendered natively. # Mode B's "lifted" variant is just modeC_bb_lifted / modeC_bw_lifted on hs1. # Drops bb/bw files + chain pair + hub.txt into $1 # (default ~/public_html/quickLiftBench/testHub). # # Refs Redmine #37445. set -euo pipefail N="${N:-5000}" SEED="${SEED:-42}" REGION_START="${REGION_START:-15000000}" REGION_END="${REGION_END:-50000000}" BW_STEP="${BW_STEP:-1000}" +# Per-feature span in bp. Stride is max(span/N, FEATURE_W), so shrinking +# FEATURE_W lets the bench fit more features in the same region without +# forcing pack-stacking from overlaps. Adjacent features are placed +# back-to-back (no gap) when N saturates the region. The BED12 block model +# scales proportionally with FEATURE_W (three blocks at 10%/40%/10% of width +# with 30% and 90% block-start offsets), preserving the same exon/intron +# shape across all sizes. +FEATURE_W="${FEATURE_W:-5000}" DEST="${1:-$HOME/public_html/quickLiftBench/testHub}" HG38_CHROM_SIZES=/hive/data/genomes/hg38/chrom.sizes HS1_CHROM_SIZES=/hive/data/genomes/hs1/chrom.sizes HG38_TO_HS1_CHAIN=/gbdb/hg38/liftOver/hg38ToHs1.over.chain.gz # bigChain pair used by quickLift for hg38 -> hs1 on-the-fly remap. # Copied into the hub dir so the bench is self-contained on one host. QUICK_CHAIN_SRC=/hive/data/genomes/hs1/bed/lastz.hg38/axtChain/hs1.hg38.quick.bb QUICK_CHAIN_LINK_SRC=/hive/data/genomes/hs1/bed/lastz.hg38/axtChain/hs1.hg38.quickLink.bb mkdir -p "$DEST" cd "$DEST" -python3 - "$N" "$SEED" "$REGION_START" "$REGION_END" > modeC_hg38.bed <<'PY' +python3 - "$N" "$SEED" "$REGION_START" "$REGION_END" "$FEATURE_W" > modeC_hg38.bed <<'PY' import random, sys -N, seed, start_region, end_region = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +N, seed, start_region, end_region, W = (int(sys.argv[i]) for i in range(1, 6)) +if W < 50: + sys.exit(f"FEATURE_W must be >= 50bp (got {W})") random.seed(seed) chrom = "chr22" span = end_region - start_region -# Lay features at regular stride so they don't overlap (~5kb each). -stride = max(span // N, 6000) +# Lay features at regular stride. Stride must clear the feature width itself +# to keep features non-overlapping; at high N adjacent features are placed +# back-to-back (stride == W) with no gap. +stride = max(span // N, W) +# Block model: three blocks at 10% / 40% / 10% of width, starting at +# offsets 0, 30%, and 90% of width. Matches the original 500/2000/500 + +# 0/1500/4500 pattern when W=5000. Bound each block at >=1bp so very small +# FEATURE_W (e.g. 50) still produces a valid BED12. +def w_at(frac): + return max(1, int(round(W * frac))) +b1, b2, b3 = w_at(0.1), w_at(0.4), w_at(0.1) +s1, s2, s3 = 0, w_at(0.3), w_at(0.9) +thick_lo = w_at(0.2) +thick_hi = W - w_at(0.2) +emitted = 0 for i in range(N): s = start_region + i * stride - e = s + 5000 + e = s + W if e > end_region: break strand = "+" if i % 2 == 0 else "-" - thickStart, thickEnd = s + 1000, e - 1000 + thickStart, thickEnd = s + thick_lo, s + thick_hi print("\t".join(map(str, [ chrom, s, e, f"feat_{i:05d}", 0, strand, thickStart, thickEnd, "100,100,200", - 3, "500,2000,500,", "0,1500,4500," + 3, f"{b1},{b2},{b3},", f"{s1},{s2},{s3},", ]))) + emitted += 1 +if emitted < N: + print( + f"WARNING: requested N={N} but only {emitted} features fit " + f"in {span}bp at stride={stride}bp (FEATURE_W={W}). " + f"Pass a smaller FEATURE_W or larger REGION span.", + file=sys.stderr, + ) PY liftOver -bedPlus=12 modeC_hg38.bed "$HG38_TO_HS1_CHAIN" modeC_hs1.bed unmapped_hs1.bed sort -k1,1 -k2,2n modeC_hg38.bed -o modeC_hg38.bed sort -k1,1 -k2,2n modeC_hs1.bed -o modeC_hs1.bed bedToBigBed -type=bed12 modeC_hg38.bed "$HG38_CHROM_SIZES" modeC_hg38.bb bedToBigBed -type=bed12 modeC_hs1.bed "$HS1_CHROM_SIZES" modeC_hs1.bb # ---- bigWig: BedGraph at BW_STEP bp resolution, lifted by chain ---- python3 - "$REGION_START" "$REGION_END" "$BW_STEP" <<'PY' > modeC_hg38.bedGraph import math, sys s0, e0, step = int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]) chrom = "chr22" for s in range(s0, e0, step): e = s + step # smooth signal in [50, 150]; chrom-position-driven, deterministic v = 100 + 50 * math.sin((s - s0) / 100000.0) print(f"{chrom}\t{s}\t{e}\t{v:.4f}") PY # liftOver of bedGraph: keep 4th column (value) as bedPlus extra. liftOver -bedPlus=3 modeC_hg38.bedGraph "$HG38_TO_HS1_CHAIN" \ modeC_hs1.bedGraph unmapped_hs1.bedGraph # liftOver may emit overlapping intervals when a chain block edge falls # inside a step; bedGraphToBigWig requires strictly non-overlapping. Sort, # then drop any line whose start is < the previous line's end (cheap # defensive trim; loses at most a handful of bins per chain edge). sort -k1,1 -k2,2n modeC_hg38.bedGraph -o modeC_hg38.bedGraph sort -k1,1 -k2,2n modeC_hs1.bedGraph | awk 'BEGIN{OFS="\t"; pc=""; pe=-1} {if (!($1==pc && $2<pe)) {print; pc=$1; pe=$3}}' > modeC_hs1.bedGraph.trim mv modeC_hs1.bedGraph.trim modeC_hs1.bedGraph bedGraphToBigWig modeC_hg38.bedGraph "$HG38_CHROM_SIZES" modeC_hg38.bw bedGraphToBigWig modeC_hs1.bedGraph "$HS1_CHROM_SIZES" modeC_hs1.bw # Co-locate the quickLift chain pair so both stanzas fetch from one host. # quickLift's chain loader constructs the link filename from the .bb name # by replacing ".bb" with ".link.bb", so the link file MUST be named # <chain>.link.bb (not <chain>Link.bb). cp "$QUICK_CHAIN_SRC" hg38ToHs1.quick.bb cp "$QUICK_CHAIN_LINK_SRC" hg38ToHs1.quick.link.bb # Two single-genome useOneFile hubs (hub_hs1.txt, hub_hg38.txt). We can't # put both genomes in one useOneFile because trackHubGenomeReadRa() breaks # out of the genome-parsing loop at the first `track` line (trackHub.c:679), # so a 2nd `genome` block after any track stanza is silently dropped. cat > hub_hs1.txt <<EOF hub quickLiftBench_hs1 shortLabel quickLiftBench (hs1) longLabel quickLiftBench: hs1 stanzas for Mode C and the lifted side of Mode B useOneFile on email braney@ucsc.edu descriptionUrl https://redmine.gi.ucsc.edu/issues/37445 genome hs1 track modeC_bb_native shortLabel modeC bb native longLabel Mode C bigBed native: BED12 features in hs1 coords type bigBed 12 bigDataUrl modeC_hs1.bb visibility pack color 0,128,0 priority 1 track modeC_bb_lifted shortLabel modeC bb lifted longLabel Mode C bigBed lifted: same features in hg38 coords + quickLift hg38->hs1 type bigBed 12 bigDataUrl modeC_hg38.bb quickLiftUrl hg38ToHs1.quick.bb quickLiftDb hg38 visibility pack color 0,128,0 priority 2 track modeC_bw_native shortLabel modeC bw native longLabel Mode C bigWig native: signal in hs1 coords type bigWig bigDataUrl modeC_hs1.bw visibility full color 0,128,0 viewLimits 0:200 priority 3 track modeC_bw_lifted shortLabel modeC bw lifted longLabel Mode C bigWig lifted: same signal in hg38 coords + quickLift hg38->hs1 type bigWig bigDataUrl modeC_hg38.bw quickLiftUrl hg38ToHs1.quick.bb quickLiftDb hg38 viewLimits 0:200 visibility full color 0,128,0 priority 4 EOF cat > hub_hg38.txt <<EOF hub quickLiftBench_hg38 shortLabel quickLiftBench (hg38) longLabel quickLiftBench: hg38-native stanzas for the native side of Mode B useOneFile on email braney@ucsc.edu descriptionUrl https://redmine.gi.ucsc.edu/issues/37445 genome hg38 track modeB_bb_native shortLabel modeB bb native longLabel Mode B bigBed native: features rendered natively on hg38 (source assembly) type bigBed 12 bigDataUrl modeC_hg38.bb visibility pack color 0,128,0 priority 1 track modeB_bw_native shortLabel modeB bw native longLabel Mode B bigWig native: signal rendered natively on hg38 (source assembly) type bigWig bigDataUrl modeC_hg38.bw visibility full color 0,128,0 viewLimits 0:200 priority 2 EOF echo "hub built at $DEST" echo " modeC_hg38.bb: $(wc -l < modeC_hg38.bed) features" echo " modeC_hs1.bb: $(wc -l < modeC_hs1.bed) features (after liftOver)" echo " bedGraph bins: hg38=$(wc -l < modeC_hg38.bedGraph) hs1=$(wc -l < modeC_hs1.bedGraph)"