Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,18 @@ result = pipeline.run(manifest_path="/path/to/slides.csv")

### Input Manifest

Manifest-driven runs use the schema below. `mask_path` is optional.
Manifest-driven runs use the schema below. `mask_path` and `spacing_at_level_0` are optional.

```csv
sample_id,image_path,mask_path
slide-1,/path/to/slide-1.svs,/path/to/mask-1.png
slide-2,/path/to/slide-2.svs,
sample_id,image_path,mask_path,spacing_at_level_0
slide-1,/path/to/slide-1.svs,/path/to/mask-1.png,0.25
slide-2,/path/to/slide-2.svs,,
...
```

Use `spacing_at_level_0` when the slide file reports a missing or incorrect level-0 spacing and you want to override it.


### Outputs

The package writes explicit artifact directories:
Expand Down
22 changes: 18 additions & 4 deletions docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,16 @@ This command:

## Input Manifest

The manifest must use the HS2P schema. `mask_path` is optional.
The manifest must use the HS2P schema. `mask_path` and `spacing_at_level_0` are optional.

```csv
sample_id,image_path,mask_path
slide-1,/path/to/slide-1.svs,/path/to/mask-1.png
slide-2,/path/to/slide-2.svs,
sample_id,image_path,mask_path,spacing_at_level_0
slide-1,/path/to/slide-1.svs,/path/to/mask-1.png,0.25
slide-2,/path/to/slide-2.svs,,
```

Use `spacing_at_level_0` when you need to override the slide's native level-0 spacing metadata for tiling.

Set `csv:` in your config file to point to this manifest.

## What the Config Controls
Expand Down Expand Up @@ -115,6 +117,18 @@ The CLI writes explicit artifact directories under the run output directory:
- optional `slide_latents/<sample_id>.pt` or `.npz`
- `process_list.csv`
- the resolved saved config file for the run
- `logs/` with the main log plus distributed worker stdout/stderr captures when multi-GPU workers are used

## Progress UX

When stdout is an interactive terminal, the CLI shows live `rich` progress for:

- tiling discovery and completion
- overall slide embedding progress
- current-slide tile or region progress
- slide-level aggregation when the model pools tile features into slide embeddings

When stdout is not interactive, the CLI falls back to plain text stage updates and summaries.

## Typical Workflows

Expand Down
4 changes: 4 additions & 0 deletions docs/python-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ Shape conventions:

Use `embed_slides(...)` for ordered multi-slide in-memory extraction.

If a slide reports the wrong native spacing, pass a `SlideSpec`-like object or mapping with `spacing_at_level_0`, or use `Model.embed_slide(..., spacing_at_level_0=...)` for path-like inputs.

When `ExecutionOptions(num_gpus=2)` or another value greater than `1` is used:

- `embed_slide(...)` shards one slide's tiles across GPUs
Expand Down Expand Up @@ -145,3 +147,5 @@ result = pipeline.run(manifest_path="/path/to/slides.csv")
- `tile_artifacts`
- `slide_artifacts`
- `process_list_path`

The manifest schema matches HS2P and accepts optional `mask_path` and `spacing_at_level_0` columns.
1 change: 1 addition & 0 deletions requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ huggingface-hub>=0.30.0,<1.0
numpy<2
pandas
pillow
rich
tqdm
wandb
torch>=2.3,<2.8
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ wandb
numpy==1.26.1
pandas
pillow
rich
einops
tqdm
omegaconf
wholeslidedata
huggingface_hub
torch==2.1.0
torchvision==0.16.0
torchvision==0.16.0
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ install_requires =
numpy<2
pandas
pillow
rich
tqdm
torchvision
wholeslidedata<0.0.16
Expand Down
20 changes: 14 additions & 6 deletions slide2vec/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
from slide2vec.artifacts import SlideEmbeddingArtifact, TileEmbeddingArtifact

if TYPE_CHECKING:
from slide2vec.inference import LoadedModel, SlideRecord
from hs2p import SlideSpec
from slide2vec.inference import LoadedModel
else:
LoadedModel = Any
SlideRecord = Any
SlideSpec = Any


DEFAULT_LEVEL_BY_NAME = {
Expand All @@ -29,9 +30,10 @@ class SlideLike(Protocol):
sample_id: str
image_path: PathLike
mask_path: PathLike | None
spacing_at_level_0: float | None


SlideInput = PathLike | Mapping[str, object] | SlideLike | SlideRecord
SlideInput = PathLike | Mapping[str, object] | SlideLike | SlideSpec
SlideSequence = Sequence[SlideInput]
TilingResultsInput = Sequence[Any] | Mapping[str, Any]

Expand Down Expand Up @@ -237,18 +239,20 @@ def embed_slide(
execution: ExecutionOptions | None = None,
sample_id: str | None = None,
mask_path: PathLike | None = None,
spacing_at_level_0: float | None = None,
) -> EmbeddedSlide:
...

@overload
def embed_slide(
self,
slide: Mapping[str, object] | SlideLike | SlideRecord,
slide: Mapping[str, object] | SlideLike | SlideSpec,
*,
preprocessing: PreprocessingConfig,
execution: ExecutionOptions | None = None,
sample_id: None = None,
mask_path: None = None,
spacing_at_level_0: None = None,
) -> EmbeddedSlide:
...

Expand All @@ -260,15 +264,19 @@ def embed_slide(
execution: ExecutionOptions | None = None,
sample_id: str | None = None,
mask_path: PathLike | None = None,
spacing_at_level_0: float | None = None,
) -> EmbeddedSlide:
if isinstance(slide, (str, Path)):
slide = {
"sample_id": sample_id or Path(slide).stem,
"image_path": Path(slide),
"mask_path": Path(mask_path) if mask_path is not None else None,
"spacing_at_level_0": spacing_at_level_0,
}
elif sample_id is not None or mask_path is not None:
raise ValueError("sample_id and mask_path overrides are only supported when slide is a path-like input")
elif sample_id is not None or mask_path is not None or spacing_at_level_0 is not None:
raise ValueError(
"sample_id, mask_path, and spacing_at_level_0 overrides are only supported when slide is a path-like input"
)
return self.embed_slides(
[slide],
preprocessing=preprocessing,
Expand Down
11 changes: 7 additions & 4 deletions slide2vec/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse

from slide2vec.api import ExecutionOptions, Model, Pipeline, PreprocessingConfig
import slide2vec.progress as progress


def get_args_parser(add_help: bool = True):
Expand Down Expand Up @@ -44,10 +45,12 @@ def main(argv=None):
parser = get_args_parser(add_help=True)
args = parser.parse_args(argv)
pipeline, cfg = build_model_and_pipeline(args)
return pipeline.run(
manifest_path=cfg.csv,
tiling_only=args.tiling_only,
)
reporter = progress.create_cli_progress_reporter(output_dir=getattr(cfg, "output_dir", None))
with progress.activate_progress_reporter(reporter):
return pipeline.run(
manifest_path=cfg.csv,
tiling_only=args.tiling_only,
)


def _setup_cli_config(args):
Expand Down
4 changes: 2 additions & 2 deletions slide2vec/configs/preprocessing/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@ tiling:
drop_holes: false # whether or not to drop tiles whose center pixel falls within an identified hole
use_padding: true # whether to pad the border of the slide
seg_params:
downsample: 16 # find the closest downsample in the slide for tissue segmentation
downsample: 64 # find the closest downsample in the slide for tissue segmentation
sthresh: 8 # segmentation threshold (positive integer, using a higher threshold leads to less foreground and more background detection) (not used when use_otsu=True)
sthresh_up: 255 # upper threshold value for scaling the binary mask
mthresh: 7 # median filter size (positive, odd integer)
close: 4 # additional morphological closing to apply following initial thresholding (positive integer)
use_otsu: false # use otsu's method instead of simple binary thresholding
use_hsv: true # use HSV thresholding instead of simple binary thresholding
filter_params:
ref_tile_size: 16 # reference tile size at spacing tiling.params.target_spacing_um
ref_tile_size: ${target_tile_size_px} # reference tile size at spacing tiling.params.target_spacing_um
a_t: 4 # area filter threshold for tissue (positive integer, the minimum size of detected foreground contours to consider, relative to the reference tile size ref_tile_size, e.g. a value of 10 means only detected foreground contours of size greater than 10 [ref_tile_size, ref_tile_size] tiles at spacing tiling.params.target_spacing_um will be kept)
a_h: 2 # area filter threshold for holes (positive integer, the minimum size of detected holes/cavities in foreground contours to avoid, once again relative to the reference tile size ref_tile_size)
max_n_holes: 8 # maximum number of holes to consider per detected foreground contour (positive integer, higher values lead to more accurate patching but increase computational cost; keeps the biggest holes)
Expand Down
80 changes: 43 additions & 37 deletions slide2vec/distributed/direct_embed_worker.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
from contextlib import nullcontext
import json
from pathlib import Path

Expand All @@ -25,6 +26,7 @@ def main(argv=None) -> int:
deserialize_preprocessing,
load_successful_tiled_slides,
)
from slide2vec.progress import JsonlProgressReporter, activate_progress_reporter

parser = get_args_parser(add_help=True)
args = parser.parse_args(argv)
Expand Down Expand Up @@ -52,49 +54,53 @@ def main(argv=None) -> int:
slide.sample_id: (slide, tiling_result)
for slide, tiling_result in zip(slide_records, tiling_results)
}
progress_events_path = request.get("progress_events_path")
reporter = JsonlProgressReporter(progress_events_path, rank=global_rank) if progress_events_path else None
context = activate_progress_reporter(reporter) if reporter is not None else nullcontext()

if request["strategy"] == "tile_shard":
sample_id = request["sample_id"]
slide, tiling_result = paired_by_sample[sample_id]
num_tiles = len(tiling_result.x)
tile_indices = np.array_split(np.arange(num_tiles, dtype=np.int64), world_size)[global_rank]
loaded = model._load_backend()
tile_embeddings = _compute_tile_embeddings_for_slide(
loaded,
with context:
if request["strategy"] == "tile_shard":
sample_id = request["sample_id"]
slide, tiling_result = paired_by_sample[sample_id]
num_tiles = len(tiling_result.x)
tile_indices = np.array_split(np.arange(num_tiles, dtype=np.int64), world_size)[global_rank]
loaded = model._load_backend()
tile_embeddings = _compute_tile_embeddings_for_slide(
loaded,
model,
slide,
tiling_result,
preprocessing=preprocessing,
execution=execution,
tile_indices=tile_indices,
)
payload = {
"tile_index": torch.as_tensor(tile_indices, dtype=torch.long),
"tile_embeddings": tile_embeddings.detach().cpu() if torch.is_tensor(tile_embeddings) else torch.as_tensor(tile_embeddings),
}
torch.save(payload, coordination_dir / f"{sample_id}.tiles.rank{global_rank}.pt")
return 0

assigned_ids = list(request.get("assignments", {}).get(str(global_rank), []))
if not assigned_ids:
return 0
assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
embedded_slides = _compute_embedded_slides(
model,
slide,
tiling_result,
assigned_slides,
assigned_tiling_results,
preprocessing=preprocessing,
execution=execution,
tile_indices=tile_indices,
)
payload = {
"tile_index": torch.as_tensor(tile_indices, dtype=torch.long),
"tile_embeddings": tile_embeddings.detach().cpu() if torch.is_tensor(tile_embeddings) else torch.as_tensor(tile_embeddings),
}
torch.save(payload, coordination_dir / f"{sample_id}.tiles.rank{global_rank}.pt")
return 0

assigned_ids = list(request.get("assignments", {}).get(str(global_rank), []))
if not assigned_ids:
for embedded_slide in embedded_slides:
payload = {
"tile_embeddings": _to_cpu_payload(torch, embedded_slide.tile_embeddings),
"slide_embedding": _to_cpu_payload(torch, embedded_slide.slide_embedding),
"latents": _to_cpu_payload(torch, embedded_slide.latents),
}
torch.save(payload, coordination_dir / f"{embedded_slide.sample_id}.embedded.pt")
return 0
assigned_slides = [paired_by_sample[sample_id][0] for sample_id in assigned_ids]
assigned_tiling_results = [paired_by_sample[sample_id][1] for sample_id in assigned_ids]
embedded_slides = _compute_embedded_slides(
model,
assigned_slides,
assigned_tiling_results,
preprocessing=preprocessing,
execution=execution,
)
for embedded_slide in embedded_slides:
payload = {
"tile_embeddings": _to_cpu_payload(torch, embedded_slide.tile_embeddings),
"slide_embedding": _to_cpu_payload(torch, embedded_slide.slide_embedding),
"latents": _to_cpu_payload(torch, embedded_slide.latents),
}
torch.save(payload, coordination_dir / f"{embedded_slide.sample_id}.embedded.pt")
return 0
finally:
if dist.is_available() and dist.is_initialized():
dist.destroy_process_group()
Expand Down
28 changes: 17 additions & 11 deletions slide2vec/distributed/pipeline_worker.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
from contextlib import nullcontext
import json
from pathlib import Path

Expand All @@ -22,6 +23,7 @@ def main(argv=None) -> int:
deserialize_preprocessing,
load_successful_tiled_slides,
)
from slide2vec.progress import JsonlProgressReporter, activate_progress_reporter

parser = get_args_parser(add_help=True)
args = parser.parse_args(argv)
Expand Down Expand Up @@ -49,21 +51,25 @@ def main(argv=None) -> int:
return 0
assigned_slides = [slide for slide, _ in assigned_pairs]
assigned_tiling_results = [tiling_result for _, tiling_result in assigned_pairs]
embedded_slides = _compute_embedded_slides(
model,
assigned_slides,
assigned_tiling_results,
preprocessing=preprocessing,
execution=execution,
)
for embedded_slide, tiling_result in zip(embedded_slides, assigned_tiling_results):
_persist_embedded_slide(
progress_events_path = request.get("progress_events_path")
reporter = JsonlProgressReporter(progress_events_path, rank=global_rank) if progress_events_path else None
context = activate_progress_reporter(reporter) if reporter is not None else nullcontext()
with context:
embedded_slides = _compute_embedded_slides(
model,
embedded_slide,
tiling_result,
assigned_slides,
assigned_tiling_results,
preprocessing=preprocessing,
execution=execution,
)
for embedded_slide, tiling_result in zip(embedded_slides, assigned_tiling_results):
_persist_embedded_slide(
model,
embedded_slide,
tiling_result,
preprocessing=preprocessing,
execution=execution,
)
return 0
finally:
if dist.is_available() and dist.is_initialized():
Expand Down
Loading
Loading