16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -4,12 +4,28 @@

* `differential_expression/create_pseudobulks`: Removed the functionality to filter pseudobulk samples based on a threshold for the number of aggregated samples, as this is now covered by `filter/delimit_counts` (PR #1044).

## MAJOR CHANGES

* `correction/cellbender_remove_background_v0_2` is now deprecated in favor of `correction/cellbender_remove_background` and will be removed in a future release (PR #).

## MINOR CHANGES

* Update scanpy to `1.11.4` (PR #).

* `cluster/leiden` and `transform/log1p`: avoid creating unnecessary copies of the output data (PR #).

* `convert/from_10xh5_to_h5mu`: avoid creating copies of the data when filtering (PR #).

* `cluster/leiden`: added `flavor`, `n_iterations` and `seed` arguments (PR #).

## NEW FUNCTIONALITY

* `filter/filter_with_pattern`: Filters a MuData object based on gene names using a regex pattern (PR #1070).

* `filter/delimit_counts`: Turns an .obs column of a MuData file containing count data into a boolean column based on thresholds (PR #1069).

* `dimred/pca`: added the possibility to do chunked processing using the arguments `chunked` and `chunk_size`. Also added a `seed` argument to better control the variability between executions (PR #).

## EXPERIMENTAL

* `differential_expression/deseq2`: Performs differential expression analysis using DESeq2 on bulk or pseudobulk datasets (PR #1044).
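For context on the `filter/delimit_counts` entry above, a minimal sketch of the idea — turning a count column in `.obs` into a boolean column via thresholds. The column name, modality and threshold values here are hypothetical, not the component's actual defaults:

```python
import mudata as mu

mdata = mu.read_h5mu("input.h5mu")
counts = mdata.mod["rna"].obs["total_counts"]  # hypothetical count column
# Mark observations whose counts fall inside the [min, max] window
mdata.mod["rna"].obs["total_counts_pass"] = (counts >= 100) & (counts <= 10000)
mdata.write_h5mu("output.h5mu")
```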
2 changes: 1 addition & 1 deletion src/base/requirements/scanpy.yaml
@@ -1,2 +1,2 @@
packages:
- scanpy~=1.10.4
- scanpy~=1.11.4
23 changes: 20 additions & 3 deletions src/cluster/leiden/config.vsh.yaml
@@ -50,9 +50,26 @@ arguments:
Name of the .obsm key under which to add the cluster labels.
The names of the columns in the matrix will correspond to the resolutions.
default: "leiden"

# todo: add uns_params
# example: uns["leiden"] = {'params': {'n_iterations': -1, 'random_state': 0, 'resolution': 1.0}}
- name: --flavor
type: string
description: |
Which package's implementation to use.
choices: ["leidenalg", "igraph"]
default: "leidenalg"
- name: "--n_iterations"
required: false
type: integer
min: 1
description: |
How many iterations of the Leiden clustering algorithm to perform.
When set, a positive value defines the total number of iterations to perform.
When not set, the algorithm runs until it reaches its optimal clustering.
- name: "--seed"
required: false
type: integer
description: |
Fix the initialization of the optimization. Can be used to increase reproducibility.
min: 0

# arguments
- name: "--resolution"
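The new arguments map one-to-one onto parameters of scanpy's `sc.tl.leiden`; a minimal sketch of the equivalent call, assuming an AnnData with precomputed neighbors (the file name and argument values are hypothetical):

```python
import scanpy as sc

adata = sc.read_h5ad("neighbors_precomputed.h5ad")  # hypothetical input
sc.tl.leiden(
    adata,
    resolution=1.0,
    flavor="leidenalg",  # --flavor: "leidenalg" or "igraph"
    n_iterations=2,      # --n_iterations; -1 runs until optimal clustering
    random_state=0,      # --seed
    key_added="leiden",
)
```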
26 changes: 14 additions & 12 deletions src/cluster/leiden/script.py
@@ -4,7 +4,6 @@
import time
import logging
import logging.handlers
import warnings
import mudata as mu
import pandas as pd
import scanpy as sc
@@ -54,6 +53,9 @@

_shared_logger_name = "leiden"

# When --n_iterations is not set, fall back to -1 (run until convergence)
if not par["n_iterations"]:
    par["n_iterations"] = -1


# Function to check available space in /dev/shm
def get_available_shared_memory():
@@ -152,18 +154,18 @@ def run_single_resolution(shared_csr_matrix, obs_names, resolution):
try:
connectivities = shared_csr_matrix.to_csr_matrix()
adata = create_empty_anndata_with_connectivities(connectivities, obs_names)
with warnings.catch_warnings():
# In the future, the default backend for leiden will be igraph instead of leidenalg.
warnings.simplefilter(action="ignore", category=FutureWarning)
adata_out = sc.tl.leiden(
adata,
resolution=resolution,
key_added=str(resolution),
obsp="connectivities",
copy=True,
)
sc.tl.leiden(
adata,
resolution=resolution,
key_added=str(resolution),
obsp="connectivities",
flavor=par["flavor"],
n_iterations=par["n_iterations"],
random_state=par["seed"],
copy=False,
)
logger.info(f"Returning result for resolution {resolution}")
return adata_out.obs[str(resolution)]
return adata.obs[str(resolution)]
finally:
obs_names.shm.close()
shared_csr_matrix.close()
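The script now clusters in place (`copy=False`) on a throwaway AnnData that carries only the connectivity graph, avoiding a duplicated result object per resolution. A self-contained sketch of that pattern with a toy graph (sizes and keys are illustrative, not the component's code):

```python
import numpy as np
import scanpy as sc
from anndata import AnnData
from scipy.sparse import random as sparse_random

n_obs = 100
graph = sparse_random(n_obs, n_obs, density=0.05, format="csr")
connectivities = graph + graph.T  # symmetrize the toy adjacency

adata = AnnData(np.empty((n_obs, 0)))  # empty X: only the graph is needed
adata.obsp["connectivities"] = connectivities
sc.tl.leiden(
    adata,
    resolution=1.0,
    key_added="1.0",
    obsp="connectivities",
    flavor="leidenalg",
    n_iterations=-1,  # run until the optimal clustering is reached
    random_state=0,
    copy=False,
)
labels = adata.obs["1.0"]  # cluster labels, written in place
```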
9 changes: 0 additions & 9 deletions src/cluster/leiden/test.py
@@ -35,15 +35,6 @@ def mudata_custom_connectivities_key(input_data, random_h5mu_path):
return output_path


# @pytest.fixture
# def random_h5mu_path(tmp_path):
# def wrapper():
# unique_filename = f"{str(uuid.uuid4())}.h5mu"
# temp_file = tmp_path / unique_filename
# return temp_file
# return wrapper


@pytest.mark.parametrize("compression", ["gzip", ""])
@pytest.mark.parametrize(
"output_key,expected_output_key", [("fooleiden", "fooleiden"), ("", "leiden")]
20 changes: 12 additions & 8 deletions src/convert/from_10xh5_to_h5mu/script.py
@@ -10,9 +10,10 @@
"uns_metrics": "metrics_cellranger",
"output": "foo.h5mu",
"min_genes": None,
"min_counts": None,
"output_compression": "gzip",
"min_counts": 1,
}
meta = {"resources_dir": "src/utils"}
## VIASH END

sys.path.append(meta["resources_dir"])
@@ -51,13 +52,16 @@ def read_percentage(val):

# might perform basic filtering to get rid of some data
# applicable when starting from the raw counts
if par["min_genes"]:
logger.info("Filtering with min_genes=%d", par["min_genes"])
sc.pp.filter_cells(adata, min_genes=par["min_genes"])

if par["min_counts"]:
logger.info("Filtering with min_counts=%d", par["min_counts"])
sc.pp.filter_cells(adata, min_counts=par["min_counts"])
filtering_args = {}
for filtering_par_name in ("min_genes", "min_counts"):
    if (arg_value := par[filtering_par_name]) is not None:
        filtering_args[filtering_par_name] = arg_value
if filtering_args:
    logger.info(
        "Filtering with %s",
        ", ".join("=".join(map(str, item)) for item in filtering_args.items()),
    )
    # scanpy allows only one filtering criterion per filter_cells call
    for name, value in filtering_args.items():
        sc.pp.filter_cells(adata, **{name: value})

# generate output
logger.info("Convert to mudata")
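Worth noting: scanpy's `sc.pp.filter_cells` accepts exactly one of `min_counts`/`min_genes`/`max_counts`/`max_genes` per call, which is why the rewritten block above applies the collected filters one at a time. A toy illustration:

```python
import numpy as np
import scanpy as sc
from anndata import AnnData

adata = AnnData(np.random.poisson(1.0, size=(50, 20)).astype(np.float32))
# One criterion per call; passing both at once raises a ValueError
sc.pp.filter_cells(adata, min_genes=2)
sc.pp.filter_cells(adata, min_counts=5)
```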
4 changes: 2 additions & 2 deletions src/convert/from_10xh5_to_h5mu/test.py
@@ -35,7 +35,7 @@ def test_run(run_component, random_h5mu_path):
# check if file exists
assert path.exists(output), "No output was created."

# read it with scanpy
# read it with mudata
data = read_h5mu(output)

# check whether gex was found
@@ -69,7 +69,7 @@ def test_run_with_metrics(run_component, random_h5mu_path):
# check if file exists
assert path.exists(output), "No output was created."

# read it with scanpy
# read it with mudata
data = read_h5mu(output)

# check whether uns slot was found
2 changes: 1 addition & 1 deletion src/convert/from_10xmtx_to_h5mu/run_test.py
@@ -32,7 +32,7 @@ def test_run(run_component, random_h5mu_path):
# check if file exists
assert path.exists(output), "No output was created."

# read it with scanpy
# read it with mudata
data = read_h5mu(output)

# check whether gex was found
1 change: 1 addition & 0 deletions src/correction/cellbender_remove_background_v0_2/config.vsh.yaml
@@ -1,5 +1,6 @@
name: cellbender_remove_background_v0_2
namespace: "correction"
status: "deprecated"
scope: "public"
description: |
Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.
26 changes: 22 additions & 4 deletions src/dimred/pca/config.vsh.yaml
@@ -2,7 +2,7 @@ name: pca
namespace: "dimred"
scope: "public"
description: |
Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of scikit-learn [Pedregosa11].
Computes PCA coordinates, loadings and variance decomposition.
authors:
- __merge__: /src/authors/dries_de_maeyer.yaml
roles: [ maintainer ]
@@ -33,6 +33,26 @@ arguments:
description: Column name in .var matrix that will be used to select which genes to run the PCA on.
example: filter_with_hvg

- name: "--chunked"
type: boolean_true
description: |
If True, perform an incremental PCA on segments of a predefined size. Setting this flag automatically implies zero centering.
Must be specified together with --chunk_size.

- name: "--chunk_size"
type: integer
min: 2
required: false
description: |
Number of observations to include in each chunk. Required when --chunked is passed.

- name: "--seed"
type: integer
min: 0
required: false
description: |
Used to set the initial states for the optimization.

# outputs
- name: "--output"
alternatives: ["-o"]
@@ -88,9 +108,7 @@ engines:
- procps
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]

runners:
- type: executable
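These options correspond to scanpy's incremental PCA; a minimal sketch of the equivalent call (file name, component count and chunk size are hypothetical):

```python
import scanpy as sc

adata = sc.read_h5ad("large_dataset.h5ad")  # hypothetical input
sc.tl.pca(
    adata,
    n_comps=25,
    chunked=True,       # --chunked: incremental PCA, implies zero centering
    chunk_size=50_000,  # --chunk_size: must not be smaller than n_comps
    random_state=42,    # --seed
)
```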
43 changes: 31 additions & 12 deletions src/dimred/pca/script.py
@@ -1,6 +1,7 @@
import scanpy as sc
import mudata as mu
import sys
import pandas as pd
from anndata import AnnData

## VIASH START
@@ -17,6 +18,8 @@
"uns_output": "pca_variance",
"overwrite": True,
}

meta = {"resources_dir": "src/utils"}
## VIASH END

sys.path.append(meta["resources_dir"])
@@ -31,27 +34,43 @@
logger.info("Computing PCA components for modality '%s'", par["modality"])
if par["layer"] and par["layer"] not in data.layers:
raise ValueError(f"{par['layer']} was not found in modality {par['modality']}.")

chunked, chunk_size = par["chunked"], par["chunk_size"]
if chunked:
if not chunk_size:
raise ValueError(
"Requested to perform an incremental PCA "
"('chunked'), but the chunk size is not set."
)
if chunk_size < par["num_components"]:
raise ValueError(
f"The requested chunk size ({chunk_size}) must not be smaller "
f"than the number of components ({par['num_components']})"
)

layer = data.X if not par["layer"] else data.layers[par["layer"]]
adata_input_layer = AnnData(layer)
adata_input_layer.var.index = data.var.index
adata_input_layer = AnnData(layer, var=pd.DataFrame([], index=data.var.index))

use_highly_variable = False
mask_var = None
if par["var_input"]:
if par["var_input"] not in data.var.columns:
raise ValueError(
f"Requested to use .var column {par['var_input']} "
"as a selection of genes to run the PCA on, "
f"but the column is not available for modality {par['modality']}"
)
use_highly_variable = True
adata_input_layer.var["highly_variable"] = data.var[par["var_input"]]
mask_var = data.var[par["var_input"]]

# run pca
output_adata = sc.tl.pca(
sc.tl.pca(
adata_input_layer,
n_comps=par["num_components"],
copy=True,
use_highly_variable=use_highly_variable,
copy=False, # A copy was already created
return_info=True,
mask_var=mask_var,
chunked=chunked,
chunk_size=chunk_size,
random_state=par["seed"],
)

# store output in specific objects
@@ -70,11 +89,11 @@
)
del getattr(data, field)[par[parameter_name]]

data.obsm[par["obsm_output"]] = output_adata.obsm["X_pca"]
data.varm[par["varm_output"]] = output_adata.varm["PCs"]
data.obsm[par["obsm_output"]] = adata_input_layer.obsm["X_pca"]
data.varm[par["varm_output"]] = adata_input_layer.varm["PCs"]
data.uns[par["uns_output"]] = {
"variance": output_adata.uns["pca"]["variance"],
"variance_ratio": output_adata.uns["pca"]["variance_ratio"],
"variance": adata_input_layer.uns["pca"]["variance"],
"variance_ratio": adata_input_layer.uns["pca"]["variance_ratio"],
}


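The script now runs `sc.tl.pca` in place on a lightweight AnnData wrapping a single layer, passing the `.var` selection as `mask_var` rather than the deprecated `use_highly_variable`. A self-contained sketch of that flow with random data (shapes and names are illustrative):

```python
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData

X = np.random.rand(500, 200).astype(np.float32)
var = pd.DataFrame(index=[f"gene_{i}" for i in range(200)])
adata = AnnData(X, var=var)

mask_var = np.random.rand(200) > 0.5  # e.g. a boolean .var column such as filter_with_hvg

sc.tl.pca(adata, n_comps=25, copy=False, return_info=True, mask_var=mask_var)

coords = adata.obsm["X_pca"]                         # PCA coordinates
loadings = adata.varm["PCs"]                         # loadings per gene
variance_ratio = adata.uns["pca"]["variance_ratio"]  # explained variance
```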