16 changes: 16 additions & 0 deletions CHANGELOG.md
@@ -4,12 +4,28 @@

* `differential_expression/create_pseudobulks`: Removed the functionality to filter pseudobulk samples based on a threshold for the number of aggregated samples, as this is now covered by `filter/delimit_counts` (PR #1044).

## MAJOR CHANGES

* `correction/cellbender_remove_background_v0_2` is now deprecated in favor of `correction/cellbender_remove_background` and will be removed in a future release (PR #).

## MINOR CHANGES

* Update scanpy to `1.11.4` (PR #).

* `cluster/leiden` and `transform/log1p`: avoid creating unnecessary copies of the output data (PR #).

* `convert/from_10xh5_to_h5mu`: avoid creating copies of the data when filtering (PR #).

* `cluster/leiden`: added `flavor`, `n_iterations` and `seed` arguments (PR #).

## NEW FUNCTIONALITY

* `filter/filter_with_pattern`: Filters a MuData object based on gene names using a regex pattern (PR #1070).

* `filter/delimit_counts`: Turns an .obs column of a MuData file containing count data into a boolean column based on thresholds (PR #1069).

* `dimred/pca`: added the possibility to do chunked processing using the arguments `chunked` and `chunk_size`. Also added a `seed` argument to better control the variability between executions (PR #).

## EXPERIMENTAL

* `differential_expression/deseq2`: Performs differential expression analysis using DESeq2 on bulk or pseudobulk datasets (PR #1044).
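For context on the `filter/delimit_counts` entry above, a minimal sketch of the idea — turning a count column in `.obs` into a boolean column via thresholds. The column name, modality and threshold values here are hypothetical, not the component's actual defaults:

```python
import mudata as mu

mdata = mu.read_h5mu("input.h5mu")
counts = mdata.mod["rna"].obs["total_counts"]  # hypothetical count column
# Mark observations whose counts fall inside the [min, max] window
mdata.mod["rna"].obs["total_counts_pass"] = (counts >= 100) & (counts <= 10000)
mdata.write_h5mu("output.h5mu")
```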
2 changes: 1 addition & 1 deletion src/base/requirements/scanpy.yaml
@@ -1,2 +1,2 @@
packages:
- scanpy~=1.10.4
- scanpy~=1.11.4
23 changes: 20 additions & 3 deletions src/cluster/leiden/config.vsh.yaml
@@ -50,9 +50,26 @@ arguments:
Name of the .obsm key under which to add the cluster labels.
The names of the columns in the matrix will correspond to the resolutions.
default: "leiden"

# todo: add uns_params
# example: uns["leiden"] = {'params': {'n_iterations': -1, 'random_state': 0, 'resolution': 1.0}}
- name: --flavor
type: string
description: |
Which package's implementation to use.
choices: ["leidenalg", "igraph"]
default: "leidenalg"
- name: "--n_iterations"
required: false
type: integer
min: 1
description: |
How many iterations of the Leiden clustering algorithm to perform.
When set, a positive value defines the total number of iterations to perform.
When not set, the algorithm runs until it reaches its optimal clustering.
- name: "--seed"
required: false
type: integer
description: |
Fix the initialization of the optimization. Can be used to increase reproducibility.
min: 0

# arguments
- name: "--resolution"
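The new arguments map one-to-one onto parameters of scanpy's `sc.tl.leiden`; a minimal sketch of the equivalent call, assuming an AnnData with precomputed neighbors (the file name and argument values are hypothetical):

```python
import scanpy as sc

adata = sc.read_h5ad("neighbors_precomputed.h5ad")  # hypothetical input
sc.tl.leiden(
    adata,
    resolution=1.0,
    flavor="leidenalg",  # --flavor: "leidenalg" or "igraph"
    n_iterations=2,      # --n_iterations; -1 runs until optimal clustering
    random_state=0,      # --seed
    key_added="leiden",
)
```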
26 changes: 14 additions & 12 deletions src/cluster/leiden/script.py
@@ -4,7 +4,6 @@
import time
import logging
import logging.handlers
import warnings
import mudata as mu
import pandas as pd
import scanpy as sc
@@ -54,6 +53,9 @@

_shared_logger_name = "leiden"

# When --n_iterations is not set, fall back to -1 (run until convergence)
if not par["n_iterations"]:
    par["n_iterations"] = -1


# Function to check available space in /dev/shm
def get_available_shared_memory():
@@ -152,18 +154,18 @@ def run_single_resolution(shared_csr_matrix, obs_names, resolution):
try:
connectivities = shared_csr_matrix.to_csr_matrix()
adata = create_empty_anndata_with_connectivities(connectivities, obs_names)
with warnings.catch_warnings():
# In the future, the default backend for leiden will be igraph instead of leidenalg.
warnings.simplefilter(action="ignore", category=FutureWarning)
adata_out = sc.tl.leiden(
adata,
resolution=resolution,
key_added=str(resolution),
obsp="connectivities",
copy=True,
)
sc.tl.leiden(
adata,
resolution=resolution,
key_added=str(resolution),
obsp="connectivities",
flavor=par["flavor"],
n_iterations=par["n_iterations"],
random_state=par["seed"],
copy=False,
)
logger.info(f"Returning result for resolution {resolution}")
return adata_out.obs[str(resolution)]
return adata.obs[str(resolution)]
finally:
obs_names.shm.close()
shared_csr_matrix.close()
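The script now clusters in place (`copy=False`) on a throwaway AnnData that carries only the connectivity graph, avoiding a duplicated result object per resolution. A self-contained sketch of that pattern with a toy graph (sizes and keys are illustrative, not the component's code):

```python
import numpy as np
import scanpy as sc
from anndata import AnnData
from scipy.sparse import random as sparse_random

n_obs = 100
graph = sparse_random(n_obs, n_obs, density=0.05, format="csr")
connectivities = graph + graph.T  # symmetrize the toy adjacency

adata = AnnData(np.empty((n_obs, 0)))  # empty X: only the graph is needed
adata.obsp["connectivities"] = connectivities
sc.tl.leiden(
    adata,
    resolution=1.0,
    key_added="1.0",
    obsp="connectivities",
    flavor="leidenalg",
    n_iterations=-1,  # run until the optimal clustering is reached
    random_state=0,
    copy=False,
)
labels = adata.obs["1.0"]  # cluster labels, written in place
```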
9 changes: 0 additions & 9 deletions src/cluster/leiden/test.py
@@ -35,15 +35,6 @@ def mudata_custom_connectivities_key(input_data, random_h5mu_path):
return output_path


# @pytest.fixture
# def random_h5mu_path(tmp_path):
# def wrapper():
# unique_filename = f"{str(uuid.uuid4())}.h5mu"
# temp_file = tmp_path / unique_filename
# return temp_file
# return wrapper


@pytest.mark.parametrize("compression", ["gzip", ""])
@pytest.mark.parametrize(
"output_key,expected_output_key", [("fooleiden", "fooleiden"), ("", "leiden")]
20 changes: 12 additions & 8 deletions src/convert/from_10xh5_to_h5mu/script.py
@@ -10,9 +10,10 @@
"uns_metrics": "metrics_cellranger",
"output": "foo.h5mu",
"min_genes": None,
"min_counts": None,
"output_compression": "gzip",
"min_counts": 1,
}
meta = {"resources_dir": "src/utils"}
## VIASH END

sys.path.append(meta["resources_dir"])
@@ -51,13 +52,16 @@ def read_percentage(val):

# might perform basic filtering to get rid of some data
# applicable when starting from the raw counts
if par["min_genes"]:
logger.info("Filtering with min_genes=%d", par["min_genes"])
sc.pp.filter_cells(adata, min_genes=par["min_genes"])

if par["min_counts"]:
logger.info("Filtering with min_counts=%d", par["min_counts"])
sc.pp.filter_cells(adata, min_counts=par["min_counts"])
filtering_args = {}
for filtering_par_name in ("min_genes", "min_counts"):
    if (arg_value := par[filtering_par_name]) is not None:
        filtering_args[filtering_par_name] = arg_value
if filtering_args:
    logger.info(
        "Filtering with %s",
        ", ".join("=".join(map(str, item)) for item in filtering_args.items()),
    )
    # scanpy allows only one filtering criterion per filter_cells call
    for name, value in filtering_args.items():
        sc.pp.filter_cells(adata, **{name: value})

# generate output
logger.info("Convert to mudata")
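Worth noting: scanpy's `sc.pp.filter_cells` accepts exactly one of `min_counts`/`min_genes`/`max_counts`/`max_genes` per call, which is why the rewritten block above applies the collected filters one at a time. A toy illustration:

```python
import numpy as np
import scanpy as sc
from anndata import AnnData

adata = AnnData(np.random.poisson(1.0, size=(50, 20)).astype(np.float32))
# One criterion per call; passing both at once raises a ValueError
sc.pp.filter_cells(adata, min_genes=2)
sc.pp.filter_cells(adata, min_counts=5)
```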
4 changes: 2 additions & 2 deletions src/convert/from_10xh5_to_h5mu/test.py
@@ -35,7 +35,7 @@ def test_run(run_component, random_h5mu_path):
# check if file exists
assert path.exists(output), "No output was created."

# read it with scanpy
# read it with mudata
data = read_h5mu(output)

# check whether gex was found
@@ -69,7 +69,7 @@ def test_run_with_metrics(run_component, random_h5mu_path):
# check if file exists
assert path.exists(output), "No output was created."

# read it with scanpy
# read it with mudata
data = read_h5mu(output)

# check whether uns slot was found
2 changes: 1 addition & 1 deletion src/convert/from_10xmtx_to_h5mu/run_test.py
@@ -32,7 +32,7 @@ def test_run(run_component, random_h5mu_path):
# check if file exists
assert path.exists(output), "No output was created."

# read it with scanpy
# read it with mudata
data = read_h5mu(output)

# check whether gex was found
1 change: 1 addition & 0 deletions src/correction/cellbender_remove_background_v0_2/config.vsh.yaml
@@ -1,5 +1,6 @@
name: cellbender_remove_background_v0_2
namespace: "correction"
status: "deprecated"
scope: "public"
description: |
Eliminating technical artifacts from high-throughput single-cell RNA sequencing data.
26 changes: 22 additions & 4 deletions src/dimred/pca/config.vsh.yaml
@@ -2,7 +2,7 @@ name: pca
namespace: "dimred"
scope: "public"
description: |
Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of scikit-learn [Pedregosa11].
Computes PCA coordinates, loadings and variance decomposition.
authors:
- __merge__: /src/authors/dries_de_maeyer.yaml
roles: [ maintainer ]
@@ -33,6 +33,26 @@ arguments:
description: Column name in .var matrix that will be used to select which genes to run the PCA on.
example: filter_with_hvg

- name: "--chunked"
type: boolean_true
description: |
If True, perform an incremental PCA on segments of a predefined size. Setting this flag automatically implies zero centering.
Must be specified together with --chunk_size.

- name: "--chunk_size"
type: integer
min: 2
required: false
description: |
Number of observations to include in each chunk. Required when --chunked is passed.

- name: "--seed"
type: integer
min: 0
required: false
description: |
Used to set the initial states for the optimization.

# outputs
- name: "--output"
alternatives: ["-o"]
@@ -88,9 +108,7 @@ engines:
- procps
- type: python
__merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
test_setup:
- type: python
__merge__: [ /src/base/requirements/viashpy.yaml, .]
__merge__: [ /src/base/requirements/python_test_setup.yaml, .]

runners:
- type: executable
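These options correspond to scanpy's incremental PCA; a minimal sketch of the equivalent call (file name, component count and chunk size are hypothetical):

```python
import scanpy as sc

adata = sc.read_h5ad("large_dataset.h5ad")  # hypothetical input
sc.tl.pca(
    adata,
    n_comps=25,
    chunked=True,       # --chunked: incremental PCA, implies zero centering
    chunk_size=50_000,  # --chunk_size: must not be smaller than n_comps
    random_state=42,    # --seed
)
```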
43 changes: 31 additions & 12 deletions src/dimred/pca/script.py
@@ -1,6 +1,7 @@
import scanpy as sc
import mudata as mu
import sys
import pandas as pd
from anndata import AnnData

## VIASH START
@@ -17,6 +18,8 @@
"uns_output": "pca_variance",
"overwrite": True,
}

meta = {"resources_dir": "src/utils"}
## VIASH END

sys.path.append(meta["resources_dir"])
@@ -31,27 +34,43 @@
logger.info("Computing PCA components for modality '%s'", par["modality"])
if par["layer"] and par["layer"] not in data.layers:
raise ValueError(f"{par['layer']} was not found in modality {par['modality']}.")

chunked, chunk_size = par["chunked"], par["chunk_size"]
if chunked:
if not chunk_size:
raise ValueError(
"Requested to perform an incremental PCA "
"('chunked'), but the chunk size is not set."
)
if chunk_size < par["num_components"]:
raise ValueError(
f"The requested chunk size ({chunk_size}) must not be smaller "
f"than the number of components ({par['num_components']})"
)

layer = data.X if not par["layer"] else data.layers[par["layer"]]
adata_input_layer = AnnData(layer)
adata_input_layer.var.index = data.var.index
adata_input_layer = AnnData(layer, var=pd.DataFrame([], index=data.var.index))

use_highly_variable = False
mask_var = None
if par["var_input"]:
if par["var_input"] not in data.var.columns:
raise ValueError(
f"Requested to use .var column {par['var_input']} "
"as a selection of genes to run the PCA on, "
f"but the column is not available for modality {par['modality']}"
)
use_highly_variable = True
adata_input_layer.var["highly_variable"] = data.var[par["var_input"]]
mask_var = data.var[par["var_input"]]

# run pca
output_adata = sc.tl.pca(
sc.tl.pca(
adata_input_layer,
n_comps=par["num_components"],
copy=True,
use_highly_variable=use_highly_variable,
copy=False, # A copy was already created
return_info=True,
mask_var=mask_var,
chunked=chunked,
chunk_size=chunk_size,
random_state=par["seed"],
)

# store output in specific objects
@@ -70,11 +89,11 @@
)
del getattr(data, field)[par[parameter_name]]

data.obsm[par["obsm_output"]] = output_adata.obsm["X_pca"]
data.varm[par["varm_output"]] = output_adata.varm["PCs"]
data.obsm[par["obsm_output"]] = adata_input_layer.obsm["X_pca"]
data.varm[par["varm_output"]] = adata_input_layer.varm["PCs"]
data.uns[par["uns_output"]] = {
"variance": output_adata.uns["pca"]["variance"],
"variance_ratio": output_adata.uns["pca"]["variance_ratio"],
"variance": adata_input_layer.uns["pca"]["variance"],
"variance_ratio": adata_input_layer.uns["pca"]["variance_ratio"],
}


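The script now runs `sc.tl.pca` in place on a lightweight AnnData wrapping a single layer, passing the `.var` selection as `mask_var` rather than the deprecated `use_highly_variable`. A self-contained sketch of that flow with random data (shapes and names are illustrative):

```python
import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData

X = np.random.rand(500, 200).astype(np.float32)
var = pd.DataFrame(index=[f"gene_{i}" for i in range(200)])
adata = AnnData(X, var=var)

mask_var = np.random.rand(200) > 0.5  # e.g. a boolean .var column such as filter_with_hvg

sc.tl.pca(adata, n_comps=25, copy=False, return_info=True, mask_var=mask_var)

coords = adata.obsm["X_pca"]                         # PCA coordinates
loadings = adata.varm["PCs"]                         # loadings per gene
variance_ratio = adata.uns["pca"]["variance_ratio"]  # explained variance
```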