2 changes: 1 addition & 1 deletion .dockerignore
@@ -1,3 +1,3 @@
data/
output/
docker/
outputs/
7 changes: 3 additions & 4 deletions Dockerfile
@@ -1,11 +1,10 @@
ARG UBUNTU_VERSION=22.04
ARG CUDA_MAJOR_VERSION=11.8.0
ARG CUDNN_MAJOR_VERSION=8
ARG CUDA_MAJOR_VERSION=12.8.1

########################
# Stage 1: build stage #
########################
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}-cudnn${CUDNN_MAJOR_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS build
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION} AS build

ARG USER_UID=1001
ARG USER_GID=1001
@@ -70,7 +69,7 @@ RUN python -m pip install 'flash-attn>=2.7.1,<=2.8.0' --no-build-isolation
##########################
# Stage 2: runtime stage #
##########################
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}-cudnn${CUDNN_MAJOR_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
FROM nvidia/cuda:${CUDA_MAJOR_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION}

ARG USER_UID=1001
ARG USER_GID=1001
82 changes: 82 additions & 0 deletions docs/benchmarking.md
@@ -0,0 +1,82 @@
# Benchmarking

`slide2vec` includes a benchmark runner for end-to-end embedding throughput sweeps across different GPU environments and multiple model configs.

The script samples a balanced subset of your manifest, runs untimed warmups plus repeated measured trials, and tunes only:

- `model.batch_size`
- `speed.num_workers_embedding`

It keeps the rest of each model config fixed, disables previews, resume, and Weights & Biases logging, and writes:

- `trial_results.csv`
- `best_results.csv`
- `throughput_by_gpu.png`
- `throughput_by_gpu_and_size.png`
- `tuning_<gpu>_<model>.png`

Default sweep values:

- `--n-slides 0` to benchmark the full manifest by default
- `--batch-sizes 1 32 64 128 256`
- `--embedding-workers 4 8 16 32 64 128`

## Basic Usage

```shell
python scripts/benchmark_embedding_throughput.py \
--config-files /path/to/pathojepa-small.yaml /path/to/pathojepa-base.yaml /path/to/pathojepa-large.yaml \
--model-labels PathoJEPA-S PathoJEPA-B PathoJEPA-L \
--size-labels S B L \
--csv /path/to/slides.csv \
--gpu-label "A100-80GB" \
--batch-sizes 1 32 64 128 256 \
--embedding-workers 4 8 16 32 64 128 \
--repeat 3 \
--n-slides 0 \
--output-dir /tmp/slide2vec-benchmark
```

Notes:

- the benchmark measures the full `Pipeline.run(...)` path, including tiling
- stage timings for tiling, embedding, and aggregation are also recorded when progress events are available
- embedding trials also record per-batch timing summaries from `embedding.batch.timing` events, including mean loader wait, mean ready-wait after async copy/preprocess, mean preprocess time, mean forward time, and a loader-wait fraction
- every compared model reuses the same sampled manifest within a run
- each config gets an untimed warmup before measured repeats
- benchmark config files are loaded through the same default-merge and validation path as the regular CLI, so omitted standard keys inherit the usual defaults
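
As an illustration of how those per-batch summaries combine, here is a minimal sketch (the field names and the denominator used for the loader-wait fraction are assumptions for this sketch, not the tool's exact definitions):

```python
# Hypothetical per-batch timings (ms) in the spirit of
# `embedding.batch.timing` events; field names are illustrative.
batches = [
    {"loader_wait_ms": 12.0, "ready_wait_ms": 1.5, "preprocess_ms": 3.0, "forward_ms": 40.0},
    {"loader_wait_ms": 30.0, "ready_wait_ms": 2.0, "preprocess_ms": 3.5, "forward_ms": 41.0},
]

mean_loader_wait = sum(b["loader_wait_ms"] for b in batches) / len(batches)
# One plausible definition: loader wait as a share of total batch time.
total_ms = sum(sum(b.values()) for b in batches)
loader_wait_fraction = sum(b["loader_wait_ms"] for b in batches) / total_ms
print(round(mean_loader_wait, 1), round(loader_wait_fraction, 3))  # → 21.0 0.316
```

A high fraction here is the signal that the reader side, not the GPU, is pacing the run.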

Single-model usage is still supported:

```shell
python scripts/benchmark_embedding_throughput.py \
--config-file /path/to/model-config.yaml \
--csv /path/to/slides.csv \
--gpu-label "A100-80GB"
```

In multi-model mode:

- `--config-files` is the primary interface
- `--model-labels` must match the config count
- `--size-labels` must match the config count
- size labels are explicit metadata like `S`, `B`, `L`, `XL`; the script does not infer them

## Merging GPU Runs

Run the benchmark once per GPU environment, then regenerate the cross-GPU comparison chart from multiple `trial_results.csv` files:

```shell
python scripts/benchmark_embedding_throughput.py \
--chart-only \
/tmp/a100-benchmark/trial_results.csv \
/tmp/h100-benchmark/trial_results.csv \
--output-dir /tmp/slide2vec-benchmark-merged
```

The merged outputs include:

- `throughput_by_gpu.png` for best tuned model entries per GPU
- `throughput_by_gpu_and_size.png` for grouped GPU-vs-size bars, choosing the winning model for each `(gpu, size)` bucket

Use `--copy-locally` when your slide source lives on network storage and you want to reduce I/O variance during the sweep.
161 changes: 161 additions & 0 deletions docs/gpu-throughput-optimization-protocol.md
@@ -0,0 +1,161 @@
# GPU Throughput Optimization Protocol

You are optimizing slide2vec embedding throughput on this machine. Use the existing benchmark and timing metrics as the ground truth. Prioritize changes that maximize throughput and reduce `loader_wait_fraction` and `mean_ready_wait_ms` while preserving outputs. For every change, rerun the same benchmark slice, compare throughput and timing metrics to the baseline, and keep only changes that improve throughput or clearly reduce GPU idle time.

## Goal

Iterate on `slide2vec` code to maximize embedding throughput while preserving correctness.
Primary optimization targets:

- maximize throughput
- minimize `loader_wait_fraction`
- minimize `mean_loader_wait_ms`
- minimize `mean_ready_wait_ms`
- keep outputs unchanged

## Recommended Config Shape

Keep the preprocessing config unchanged; vary only the model config to try different model sizes (from ViT-S to ViT-G) and embedding-related parameters (`batch_size`, `num_workers_embedding`, `prefetch_factor_embedding`, `persistent_workers_embedding`).


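For instance, a benchmark override might look like the following sketch (the nesting of the prefetch and persistent-workers keys is an assumption based on the parameter names above; match it to your actual config files):

```yaml
# Only embedding-related knobs vary between runs; everything else,
# including preprocessing, stays fixed. Key nesting is assumed.
model:
  batch_size: 256
speed:
  num_workers_embedding: 16
  prefetch_factor_embedding: 4
  persistent_workers_embedding: true
```
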
## Baseline Benchmark

Start with one model config and run:

```bash
python slide2vec/scripts/benchmark_embedding_throughput.py \
--config-file slide2vec/configs/h0-mini.yaml \
--csv debug-histai-local.csv \
--batch-sizes 32 64 128 256 512 \
--embedding-workers 4 8 16 32 \
--repeat 2 \
--n-slides 0 \
--output-dir output/benchmark
```

This benchmark writes per-trial metrics including embedding timing summaries derived from `embedding.batch.timing` events.

## Follow-Up Targeted Sweep

After the baseline:

- identify the best 2-3 batch sizes
- identify the best 2-3 worker counts
- rerun a tighter sweep around them
- test `prefetch_factor_embedding` values `2`, `4`, `8`

Example:

```bash
python slide2vec/scripts/benchmark_embedding_throughput.py \
--config-file slide2vec/configs/h0-mini.yaml \
--csv debug-histai-local.csv \
--batch-sizes 128 256 384 \
--embedding-workers 8 16 \
--repeat 3 \
--n-slides 0 \
--output-dir output/benchmark-tuned
```

## Metrics To Optimize

Read these from `trial_results.csv`, `best_results.csv`, and `metrics.json`:

- throughput
- `loader_wait_fraction`
- `mean_loader_wait_ms`
- `max_loader_wait_ms`
- `mean_ready_wait_ms`
- `mean_preprocess_ms`
- `mean_forward_ms`
- `timed_batches`

Interpretation:

- high `loader_wait_fraction`: the reader side is the bottleneck
- high `mean_ready_wait_ms`: transfer or preprocessing is not overlapping enough with forward
- high `mean_preprocess_ms` with low `mean_forward_ms`: preprocessing is the bottleneck
- throughput flattening while `mean_forward_ms` dominates: the run is compute-bound rather than loader-bound
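
The rules above can be sketched as a rough classifier over `trial_results.csv` rows (the threshold values are illustrative assumptions, not values used by the benchmark; adjust them to your data):

```python
def classify_bottleneck(row):
    """Classify a trial row; values arrive as strings from csv.DictReader."""
    # 0.3 is an illustrative cutoff, not a benchmark default.
    if float(row["loader_wait_fraction"]) > 0.3:
        return "reader-bound"
    if float(row["mean_ready_wait_ms"]) > float(row["mean_forward_ms"]):
        return "overlap-limited"
    if float(row["mean_preprocess_ms"]) > float(row["mean_forward_ms"]):
        return "preprocess-bound"
    return "compute-bound"

# Hypothetical row in the shape produced by csv.DictReader:
row = {
    "loader_wait_fraction": "0.45",
    "mean_ready_wait_ms": "2.0",
    "mean_preprocess_ms": "3.0",
    "mean_forward_ms": "40.0",
}
print(classify_bottleneck(row))  # → reader-bound
```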

## GPU Telemetry

Capture lightweight telemetry during benchmark runs:

```bash
nvidia-smi dmon -s pucvmet -d 1
```

or:

```bash
watch -n 1 nvidia-smi
```

Record:

- GPU utilization
- memory usage
- power
- SM activity trends during the run
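
For a log that can be summarized after the run, `nvidia-smi --query-gpu=utilization.gpu,memory.used,power.draw --format=csv,noheader -l 1 > telemetry.csv` writes one CSV line per second. A minimal sketch for summarizing such a log (the sample lines and exact field formatting are assumptions; check your driver's actual output):

```python
import statistics

# Sample lines in the format assumed for:
#   nvidia-smi --query-gpu=utilization.gpu,memory.used,power.draw \
#     --format=csv,noheader -l 1 > telemetry.csv
sample = [
    "87 %, 40532 MiB, 310.5 W",
    "91 %, 40532 MiB, 318.2 W",
    "12 %, 40532 MiB, 95.0 W",
]

def parse_line(line):
    """Strip the units and return (util %, memory MiB, power W) as floats."""
    util, mem, power = (field.strip() for field in line.split(","))
    return (float(util.rstrip(" %")), float(mem.rstrip(" MiB")), float(power.rstrip(" W")))

utils = [parse_line(line)[0] for line in sample]
print(round(statistics.mean(utils), 1))  # → 63.3
# A modest mean with occasional high peaks suggests loader stalls
# between forward passes rather than a compute-bound run.
```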

## Artifacts To Hand To The Optimizing Agent

Provide:

- benchmark output directory
- `trial_results.csv`
- `best_results.csv`
- `metrics.json`
- progress JSONL if present
- one or two `.nsys-rep` files
- the exact model config YAML
- GPU type
- CPU core count
- local disk type
- slide count

## Instructions For The Optimizing Agent

Give the agent a prompt like:

```text
You are optimizing slide2vec embedding throughput for h0-mini on a single GPU. Start from slide2vec/configs/models/h0-mini.yaml and benchmark with python slide2vec/scripts/benchmark_embedding_throughput.py --config-file slide2vec/configs/models/h0-mini.yaml --csv debug-histai-local.csv --batch-sizes 32 64 128 256 512 --embedding-workers 4 8 16 32 --repeat 2 --n-slides 0 --output-dir output/benchmark-baseline.

Your goal is to maximize throughput while preserving embedding correctness. You may change config parameters and, if justified by the metrics, change the codebase. Prioritize improvements that reduce loader_wait_fraction, mean_loader_wait_ms, and mean_ready_wait_ms. Test prefetch_factor_embedding and persistent_workers_embedding. Keep one variable sweep tight and controlled, compare every run to the same baseline, and only keep a change if throughput improves or GPU idle time clearly drops.

After each promising change, rerun the same benchmark slice, record the throughput delta and timing deltas, and summarize whether the bottleneck is reader-bound, preprocess-bound, or compute-bound. If code changes are made, keep them minimal, document them under docs/optimize-throughput, rerun the benchmark, and verify that output shapes and metadata contracts stay unchanged. Do not count a change as good unless throughput improves or idle-related metrics clearly improve.
```

Additional constraints for the agent:

- compare against the same manifest
- compare on the same GPU type
- compare with the same batch-size and worker grid unless intentionally testing a new knob
- do not count a change as good unless throughput improves or idle-related metrics clearly improve
- preserve embedding outputs and metadata contracts

## Suggested Iteration Loop

For each code change:

1. run the same benchmark slice used for the baseline
2. compare throughput and timing metrics against the baseline
3. keep the change only if it improves throughput or materially reduces idle time
4. rerun one Nsight Systems profile when a change looks promising
5. keep notes on:
- what changed
- throughput delta
- loader-wait delta
- ready-wait delta
- whether correctness changed
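
Steps 2 and 3 can be sketched as a simple decision rule (the threshold values are illustrative assumptions, not project policy):

```python
def should_keep(baseline, candidate, min_throughput_gain=0.02, min_idle_drop=0.05):
    """Keep a change only if throughput improves, or idle-related metrics
    clearly drop without a throughput regression. Thresholds are illustrative."""
    tp_delta = (candidate["throughput"] - baseline["throughput"]) / baseline["throughput"]
    idle_drop = baseline["loader_wait_fraction"] - candidate["loader_wait_fraction"]
    if tp_delta >= min_throughput_gain:
        return True
    # No throughput regression, but GPU idle time clearly dropped.
    return tp_delta >= 0 and idle_drop >= min_idle_drop

baseline = {"throughput": 120.0, "loader_wait_fraction": 0.35}
candidate = {"throughput": 131.0, "loader_wait_fraction": 0.20}
print(should_keep(baseline, candidate))  # → True: ~9% throughput gain
```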

## Success Criteria

The optimization effort is successful when:

- throughput improves materially on the target GPU
- `loader_wait_fraction` becomes a small minority of embedding time
- large batches are limited mainly by compute or memory, not by loader stalls
- Nsight shows reduced gaps between forward passes
9 changes: 8 additions & 1 deletion docs/python-api.md
@@ -60,6 +60,7 @@ Commonly overridden fields:
- `target_tile_size_px`
- `tissue_threshold`
- `backend`
`backend` is the HS2P slide-reader backend used for tiling; it may be `"asap"` or `"openslide"`, depending on which reader you want HS2P to use.

Defaults that most users can leave alone:

@@ -70,9 +71,12 @@ Defaults that most users can leave alone:
- `segmentation={}`
- `filtering={}`
- `preview={}`
- `read_tiles_from=None`
- `read_coordinates_from=<output_dir>/coordinates` when omitted
- `read_tiles_from=None` unless you want slide2vec to reuse an explicitly linked external `.tiles.tar` store root
- `resume=False`

`slide2vec` writes `.tiles.tar` stores during tiling by default. Set `read_tiles_from` only when you want embedding to consume an existing external tile-store root instead of the stores generated in the current run.

Advanced example:

```python
@@ -102,6 +106,9 @@ preprocessing = PreprocessingConfig(
- `num_workers`
- `num_gpus`
- `mixed_precision`
- `prefetch_factor`
- `persistent_workers`
- `gpu_batch_preprocessing`
- `save_tile_embeddings`
- `save_latents`

2 changes: 1 addition & 1 deletion requirements.in
Expand Up @@ -6,7 +6,7 @@ pandas
pillow
rich
tqdm
hs2p>=2.3.0,<3
hs2p>=2.4.0,<3
torch
torchvision
transformers
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
hs2p>=2.3.0,<3
hs2p>=2.4.0,<3
omegaconf>=2.3.0
h5py
matplotlib