dpeerlab · EliHei2 · Jan 31, 2026 · Feb 2, 2026 · Feb 2, 2026 · Feb 2, 2026
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,17 @@
+version: 2
+updates:
+  - package-ecosystem: pip  # Python dependency manifests found under `directory`
+    directory: '/'
+    schedule:
+      interval: weekly
+      day: sunday
+      time: '08:00'  # quoted so YAML keeps it a string; no `timezone` key is set, so Dependabot's default zone applies
+    open-pull-requests-limit: 10
+
+  - package-ecosystem: github-actions  # action versions referenced by the workflow files
+    directory: '/'
+    schedule:
+      interval: weekly
+      day: sunday
+      time: '08:00'  # same weekly slot as the pip updates
+    open-pull-requests-limit: 10
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -0,0 +1,181 @@
+# CI/CD Pipeline for Segger v0.2.0
+#
+# This workflow runs CPU-only tests on pushes and pull requests targeting main or develop (path-filtered).
+# GPU tests are excluded by default and can be run on self-hosted runners.
+#
+# Test Categories:
+# - CPU-only tests: I/O, quality filtering, writers (always run)
+# - SpatialData tests: Require spatialdata package (optional dep)
+# - GPU tests: Require CUDA (skipped on GitHub-hosted runners)
+
+name: Tests
+
+on:
+  push:
+    branches: [main, develop]  # pushes to any other branch do not trigger this workflow
+    paths:  # run only when at least one changed file matches a pattern below
+      - 'src/**'
+      - 'tests/**'
+      - 'pyproject.toml'
+      - '.github/workflows/test.yml'
+  pull_request:
+    branches: [main, develop]  # matched against the pull request's base (target) branch
+    paths:  # same path filter as for pushes
+      - 'src/**'
+      - 'tests/**'
+      - 'pyproject.toml'
+      - '.github/workflows/test.yml'
+
+# One concurrency group per workflow + ref: a newer run cancels the run still in progress in that group
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # ==========================================================================
+  # CPU-Only Tests (Ubuntu)
+  # ==========================================================================
+  test-cpu:
+    name: CPU Tests (Python ${{ matrix.python-version }})
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false  # a failure on one Python version does not cancel the other matrix job
+      matrix:
+        python-version: ["3.11", "3.12"]  # quoted so the versions stay strings rather than floats
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'  # turns on setup-python's built-in pip cache
+
+      - name: Install base dependencies  # editable install of the package with its `dev` extra
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+
+      - name: Run CPU-only tests  # deselects tests marked gpu/spatialdata/sopa and skips collecting the SpatialData I/O module
+        run: |
+          pytest tests/ -v -m "not gpu and not spatialdata and not sopa" \
+            --ignore=tests/test_spatialdata_io.py
+        env:
+          SEGGER_DATA_DIR: ${{ runner.temp }}/segger_data  # presumably where the tests keep their data; confirm in tests/
+
+      - name: Upload coverage report
+        if: matrix.python-version == '3.11'  # upload once, from the 3.11 matrix job only
+        uses: codecov/codecov-action@v4
+        with:
+          files: ./coverage.xml  # NOTE(review): the pytest command above passes no --cov flags; confirm pytest config elsewhere writes this file
+          fail_ci_if_error: false  # an upload problem does not fail the job
+
+  # ==========================================================================
+  # SpatialData Tests (Ubuntu, Python 3.11)
+  # ==========================================================================
+  test-spatialdata:
+    name: SpatialData Tests
+    runs-on: ubuntu-latest
+    # Starts only after every test-cpu matrix job has succeeded
+    needs: test-cpu
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"  # single version here; only test-cpu uses a Python matrix
+          cache: 'pip'
+
+      - name: Install dependencies with SpatialData  # `dev` extra plus the optional `spatialdata` extra
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev,spatialdata]"
+
+      - name: Run SpatialData tests  # runs the one module that test-cpu excludes via --ignore
+        run: |
+          pytest tests/test_spatialdata_io.py -v
+        env:
+          SEGGER_DATA_DIR: ${{ runner.temp }}/segger_data  # presumably where the tests keep their data; confirm in tests/
+
+  # ==========================================================================
+  # macOS Tests (Basic I/O only)
+  # ==========================================================================
+  test-macos:
+    name: macOS Tests
+    runs-on: macos-latest  # no `needs:`, so this job runs in parallel with the Ubuntu jobs
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: 'pip'
+
+      - name: Install minimal dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # Hand-picked subset instead of the full ".[dev]" install; meant to cover only the I/O tests below
+          pip install polars geopandas shapely pyarrow pandas numpy pytest pooch
+
+      - name: Install segger in no-deps mode  # best-effort: a failure is surfaced as a warning annotation instead of being hidden
+        run: |
+          pip install -e . --no-deps || echo "::warning::Editable install of segger failed; the I/O tests below may be unable to import it"
+
+      - name: Run I/O tests only
+        run: |
+          pytest tests/test_quality_filter.py tests/test_merged_writer.py -v
+        env:
+          SEGGER_DATA_DIR: ${{ runner.temp }}/segger_data  # presumably where the tests keep their data; confirm in tests/
+
+  # ==========================================================================
+  # Linting and Formatting
+  # ==========================================================================
+  lint:
+    name: Lint & Format Check
+    runs-on: ubuntu-latest  # no `needs:`, so this job runs in parallel with the test jobs
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+          cache: 'pip'
+
+      - name: Install linting tools  # NOTE(review): black and ruff are unpinned, so results can shift when new releases land
+        run: |
+          python -m pip install --upgrade pip
+          pip install black ruff
+
+      - name: Check formatting with black  # --check/--diff report formatting differences without rewriting files
+        run: |
+          black --check --diff src/ tests/
+
+      - name: Lint with ruff  # checks the same two directories as black
+        run: |
+          ruff check src/ tests/
+
+  # ==========================================================================
+  # Type Checking (Optional)
+  # ==========================================================================
+  # Uncomment when type annotations are complete
+  # typecheck:
+  #   name: Type Check
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - uses: actions/setup-python@v5
+  #       with:
+  #         python-version: "3.11"
+  #     - run: pip install mypy
+  #     - run: mypy src/segger --ignore-missing-imports
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,13 @@ __pycache__/
 *.py[codz]
 *$py.class
 
+# Claude
+CLAUDE.md
+claude.md
+CLAUDE.*
+claude.*
+.claude/
+
 # C extensions
 *.so
 
@@ -205,4 +212,28 @@ __marimo__/
 # Custom
 .dev
 .dev/*
-*.pyc
+*.pyc
+
+# Segger-specific
+*.zarr/
+*.zarr.zip
+*.parquet
+!tests/fixtures/*.parquet
+output/
+results/
+checkpoints/
+lightning_logs/
+
+# Large data files (/data/ is anchored to the project root; *.h5ad and *.h5 match at any depth)
+/data/
+*.h5ad
+*.h5
+
+# Jupyter
+*.ipynb_checkpoints/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,141 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### 1. High-level
+- No unreleased feature changes yet.
+
+### 2. Low-level
+- N/A.
+
+## [0.2.0] - 2026-02-12
+
+Comparison scope for this release note (relative to `v0.1.0`):
+- Baseline reference: `dd681a8` (`2025-12-17`, `pyproject.toml` version `0.1.0`)
+- Base comparison: `dd681a8...release/v2-stable`
+- Branch snapshot used for this summary: `2c92b43` (`2026-02-13`)
+- Delta size at that snapshot: `33` commits, `76` files changed, `18,232` insertions, `321` deletions.
+
+### 0. Technical Summary (concise)
+
+#### New CLI workflows
+- `segger predict`:
+  - Checkpoint-only inference with strict checkpoint/data compatibility checks (`segger_vocab`, `segger_me_gene_pairs`, `n_genes`).
+  - Supports inference-time graph overrides, assignment threshold controls, fragment controls, and `--use-3d`.
+- `segger export`:
+  - Unified format conversion (`xenium_explorer|merged|spatialdata|anndata`) from parquet/csv/SpatialData segmentation inputs.
+  - Adds explicit input resolution (`--input-format`) and boundary policy controls (`--boundary-method`).
+- `segger plot`:
+  - Resolves Lightning metrics automatically (or via `--log-version`), groups train/val curves by metric key, and renders terminal or PNG outputs.
+
+#### New capabilities
+- End-to-end SpatialData support (ingest + export), including optional AnnData table embedding.
+- Alignment-loss pipeline with ME-gene constraints, scheduled weighting, and checkpoint metadata persistence.
+- Fragment-mode assignment for unassigned transcripts via transcript-transcript (tx-tx) connected components, with GPU-first/CPU-fallback execution.
+
+#### Stability/performance changes
+- Strong checkpoint-first safety checks to prevent silent inference mismatches.
+- Improved thresholding and memory behavior in segmentation writing.
+- Hardened boundary generation and parallel Xenium export fallback (process -> thread retry).
+- Expanded lazy optional-dependency handling with clearer failure modes.
+- Broader test and CI coverage across the CLI, export, alignment, fragment, and SpatialData paths.
+
+### 1. High-level (major changes)
+
+#### 1.1 CLI and workflow expansion
+- Added a checkpoint-first inference command: `segger predict -c <checkpoint>`.
+- Added checkpoint metadata validation for saved vocabulary and ME-gene pairs before inference starts.
+- Added training early-stopping controls and best-checkpoint prediction handoff in `segger segment`.
+- Added `segger plot` for loss curves with both terminal output (`--quick`, `uniplot`) and image output (`matplotlib`).
+- Expanded CLI output controls to multi-format segmentation exports (`segger_raw`, `merged`, `spatialdata`, `anndata`, `all`).
+- Expanded export controls to include `--input-format`, `--boundary-method`, and related boundary-generation knobs.
+
+#### 1.2 New export architecture and format support
+- Added a format registry (`OutputFormat`, writer protocol/registration) for consistent export extension.
+- Added dedicated writers for merged transcript output, AnnData output, and SpatialData output.
+- Added a richer Xenium Explorer export path with improved polygon handling and metadata consistency.
+- Added support for choosing boundary-generation strategy (`input`, `convex_hull`, `delaunay`, `skip` where supported).
+- Added SOPA compatibility helpers and conversion utilities for SpatialData-centric downstream workflows.
+
+#### 1.3 SpatialData support from input to output
+- Added SpatialData loader support and `.zarr` path detection in the data module and CLI.
+- Added SpatialData export writer support, including transcript points and optional shapes.
+- Added optional embedding of an AnnData table in SpatialData output.
+- Added lightweight SpatialData Zarr read/write utilities for environments that avoid full `spatialdata` dependency trees.
+
+#### 1.4 Data loading and graph construction upgrades
+- Added configurable transcript quality filtering (`min_qv`) with platform-aware logic.
+- Added explicit quality-filter classes for Xenium, CosMx, MERSCOPE, and SpatialData-based inputs.
+- Added 3D-aware graph construction controls (`use_3d` with `auto/true/false` semantics).
+- Added prediction-graph scale-factor plumbing, kept in sync between the CLI and the data module so both behave consistently.
+- Added optional transcript-edge similarity capture in graph construction for downstream fragment operations.
+
+#### 1.5 Model/loss evolution (alignment + metadata-aware inference)
+- Added `AlignmentLoss` integration with scheduled weighting and combination modes (`interpolate` and `additive`).
+- Added ME-gene edge generation and labeling in heterodata construction.
+- Added contrastive same-gene positive edges and ME-pair negative edges for alignment training.
+- Added positive subsampling logic to control alignment class imbalance.
+- Added checkpoint persistence and restore of `segger_vocab` and `segger_me_gene_pairs`.
+- Added stricter runtime compatibility checks between checkpoint metadata and prediction input data.
+
+#### 1.6 Fragment-mode segmentation for unassigned transcripts
+- Added fragment-mode assignment pipeline for previously unassigned transcripts.
+- Added connected-component grouping using transcript-transcript edges with similarity thresholding.
+- Added GPU-first execution path (when RAPIDS is available) with CPU fallback behavior.
+- Added minimum-fragment-size controls and auto-threshold options for fragment similarity.
+
+#### 1.7 Optional dependency model and package surface cleanup
+- Added centralized optional dependency utilities (`segger.utils.optional_deps`) with clear install guidance.
+- Added lazy module loading in `segger.io`, `segger.export`, `segger.datasets`, and other package entry points.
+- Added explicit RAPIDS requirement checks where GPU-only operations are required.
+- Added optional dependency groups in `pyproject.toml` (`spatialdata`, `spatialdata-io`, `sopa`, `plot`, `spatialdata-all`, `dev`).
+
+#### 1.8 New datasets/helpers for reproducible testing and demos
+- Added `segger.datasets` with toy Xenium loaders and synthetic data generation.
+- Added sample-output generation helpers for merged/parquet and SpatialData conversion workflows.
+- Added plotting and SpatialData demo notebooks to document end-to-end usage.
+
+#### 1.9 Testing and CI expansion
+- Added a full test suite scaffold (`tests/`, fixtures, and targeted modules by subsystem).
+- Added tests for alignment loss, fragment mode, prediction graph behavior, exporters, optional deps, and SpatialData I/O.
+- Added CI workflow (`.github/workflows/test.yml`) and Dependabot config for dependency hygiene.
+- Added pytest and coverage configuration directly in `pyproject.toml`.
+
+#### 1.10 Documentation expansion
+- Added dedicated docs for installation troubleshooting, release process, versioning policy, loss functions, and math foundations.
+- Added a structured release-notes document for `v0.2.0`.
+
+### 2. Low-level (minor changes and refinements)
+
+#### 2.1 Accuracy, performance, and stability refinements
+- Improved thresholding logic in segmentation writing with robust Li/Yen handling and safe fallbacks.
+- Reduced peak memory in per-gene threshold calculations through iterative sampling-based processing.
+- Improved boundary generation throughput with parallel Delaunay options.
+- Added fallback from process workers to thread workers in parallel Xenium export when process pools fail.
+- Added safer empty/degenerate polygon handling in boundary extraction and export code paths.
+- Added additional positional-embedding guards for empty batches and zero-variance coordinates.
+
+#### 2.2 ME-gene discovery and alignment tuning refinements
+- Added ME-gene discovery caching keyed by scRNA input metadata and discovery parameters.
+- Added scRNA preprocessing normalization and optional per-cell-type subsampling for faster ME discovery.
+- Added progress/debug messages for ME discovery and alignment-edge creation (`SEGGER_ME_VERBOSE` / debug flags).
+- Tightened default ME exclusivity criteria and increased pair coverage tuning in discovery defaults.
+
+#### 2.3 CLI polish and compatibility refinements
+- Unified worker-count semantics across related CLI steps.
+- Improved CLI help text for format/export settings and deprecation messaging.
+- Added robust cell-id column alias resolution for export inputs.
+- Added typed handling for unassigned IDs in AnnData export paths.
+
+#### 2.4 Internal API and import refinements
+- Switched multiple package-level imports to lazy-loading patterns to reduce import side effects and startup overhead.
+- Updated data utility import strategy to stay consistent with existing project patterns.
+- Added compatibility comments and deprecation guidance around legacy `cli/config.yaml` defaults.
+
+#### 2.5 Housekeeping
+- No additional housekeeping notes in this release summary.