1,595 changes: 1,595 additions & 0 deletions notebooks/Cross_segmentation_comparison_user_guide.ipynb

Large diffs are not rendered by default.

1,685 changes: 0 additions & 1,685 deletions notebooks/cross_seg_comp_user_guide.ipynb

This file was deleted.

139 changes: 41 additions & 98 deletions spatial_compare/spatial_compare.py
@@ -6,10 +6,11 @@
import scanpy as sc
import seaborn as sns
from scipy import sparse
from scipy.spatial import cKDTree
from scipy.spatial import cKDTree, distance
from pathlib import Path
import plotly.graph_objs as go
from matplotlib.ticker import FormatStrFormatter

import warnings

from spatial_compare.utils import grouped_obs_mean
@@ -663,9 +664,13 @@ def collect_mutual_match_and_doublets(
col_name_both_filt = (
"match_" + self.data_names[0] + "_filt_" + self.data_names[1] + "_filt"
)
seg_comp_df[col_name_both_filt] = seg_comp_df.index.map(
get_mutual_matches(dfa, dfb, nn_dist)
) # filt both
matched_dfa_only, matched_dfb_only = get_mutual_matches(dfa, dfb, nn_dist)
seg_comp_df.loc[matched_dfb_only.index.values.tolist(), col_name_both_filt] = (
matched_dfb_only.iloc[:, 0]
)
seg_comp_df.loc[matched_dfa_only.index.values.tolist(), col_name_both_filt] = (
matched_dfa_only.iloc[:, 0]
)

# mutual matches with seg a filtered only
dfa = seg_comp_df[
@@ -676,9 +681,14 @@
col_name_afilt = (
"match_" + self.data_names[0] + "_filt_" + self.data_names[1] + "_unfilt"
)
seg_comp_df[col_name_afilt] = seg_comp_df.index.map(
get_mutual_matches(dfa, dfb, nn_dist)
) # a filt only
# dfa filt only
matched_dfa_only, matched_dfb_only = get_mutual_matches(dfa, dfb, nn_dist)
seg_comp_df.loc[matched_dfb_only.index.values.tolist(), col_name_afilt] = (
matched_dfb_only.iloc[:, 0]
)
seg_comp_df.loc[matched_dfa_only.index.values.tolist(), col_name_afilt] = (
matched_dfa_only.iloc[:, 0]
)

# mutual matches with seg b filtered only
dfa = seg_comp_df[seg_comp_df["source"] == self.data_names[0]]
@@ -689,32 +699,20 @@
col_name_bfilt = (
"match_" + self.data_names[0] + "_unfilt_" + self.data_names[1] + "_filt"
)
seg_comp_df[col_name_bfilt] = seg_comp_df.index.map(
get_mutual_matches(dfa, dfb, nn_dist)
) # b filt only
# dfb filt only
matched_dfa_only, matched_dfb_only = get_mutual_matches(dfa, dfb, nn_dist)
seg_comp_df.loc[matched_dfb_only.index.values.tolist(), col_name_bfilt] = (
matched_dfb_only.iloc[:, 0]
)
seg_comp_df.loc[matched_dfa_only.index.values.tolist(), col_name_bfilt] = (
matched_dfa_only.iloc[:, 0]
)

# save results
if save:
seg_comp_df.to_csv(
savepath
+ bc
+ "_seg_comp_df_"
+ self.data_names[0]
+ "_and_"
+ self.data_names[1]
+ "_populated.csv",
index=True,
)
print(
"Saved to: "
+ savepath
+ bc
+ "_seg_comp_df_"
+ self.data_names[0]
+ "_and_"
+ self.data_names[1]
+ "_populated.csv"
)
result_path = f"{savepath}{bc}_seg_comp_df_{self.data_names[0]}_and_{self.data_names[1]}_populated.csv"
seg_comp_df.to_csv(result_path, index=True)
print("Saved to: " + result_path)
return seg_comp_df
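
The rewritten matching blocks above share one pattern: get_mutual_matches now returns two DataFrames of confirmed matches, one indexed by cells from each segmentation, and both are written into the shared match column with .loc so unmatched cells stay NaN. A minimal sketch of that pattern with toy data (the index values and column name here are illustrative only):

import pandas as pd

seg_comp_df = pd.DataFrame(index=["XEN_1", "XEN_2", "SIS_7"])
col = "match_XEN_filt_SIS_filt"
seg_comp_df[col] = pd.NA

# one frame per direction: index = cell IDs in one segmentation,
# first column = the matched cell ID in the other segmentation
matched_a = pd.DataFrame({"match": ["SIS_7"]}, index=["XEN_1"])
matched_b = pd.DataFrame({"match": ["XEN_1"]}, index=["SIS_7"])

# .loc fills only the matched rows; everything else stays NaN,
# so unmatched cells remain identifiable in the combined frame
seg_comp_df.loc[matched_b.index, col] = matched_b.iloc[:, 0].values
seg_comp_df.loc[matched_a.index, col] = matched_a.iloc[:, 0].values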

def generate_sankey_diagram(
@@ -775,11 +773,12 @@ def generate_sankey_diagram(
+ self.data_names[0]
+ "_"
+ self.data_names[1]
+ ".html"
+ ".png"
)
fig.write_html(filepath)
fig.write_image(filepath)
return fig, unknown_unmatched_cells, filepath
fig.show()
return fig, unknown_unmatched_cells, filepath
return fig, unknown_unmatched_cells
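
Note the export change above: the Sankey figure is now written as a static PNG via fig.write_image rather than as an interactive HTML page. Plotly routes static export through the kaleido package, so the new call path looks roughly like this (node labels and values are toy data, not the library's output):

import plotly.graph_objects as go

fig = go.Figure(
    go.Sankey(
        node=dict(label=["XEN matched", "SIS matched", "unmatched"]),
        link=dict(source=[0, 0], target=[1, 2], value=[80, 20]),
    )
)
# write_image renders a static file (requires kaleido installed);
# write_html, the old behavior, produced an interactive page instead
fig.write_image("sankey_XEN_SIS.png")
fig.show()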

def scaling_check(self, seg_comp_df):
"""
@@ -1183,47 +1182,6 @@ def get_column_ordering(df, ordered_rows):
return output


def create_seg_comp_df(barcode, seg_name, base_path, min_transcripts):
"""
Gathers related segmentation results to create descriptive dataframe
INPUTS
barcode: unique identifier used to locate the specific segmentation path. String.
seg_name: descriptor of the segmentation, e.g. the algorithm name. String.
base_path: path to the segmentation results.
min_transcripts: minimum number of transcripts per cell; cells below this threshold are considered too low quality for mapping.
OUTPUTS
seg_df: dataframe with segmentation cell xy locations, a low-quality-cell flag, and the source segmentation ID. Index is cell IDs prefixed with the segmentation identifier string.
"""
seg_path = Path(base_path).joinpath(barcode)
cell_check = [x for x in seg_path.glob("*.csv") if "cellpose" in x.stem]
if cell_check:
cxg = pd.read_table(
str(seg_path) + "/cellpose-cell-by-gene.csv", index_col=0, sep=","
)
metadata = pd.read_table(
str(seg_path) + "/cellpose_metadata.csv", index_col=0, sep=","
)
else:
cxg = pd.read_table(str(seg_path) + "/cell_by_gene.csv", index_col=0, sep=",")
meta = ad.read_h5ad(str(seg_path) + "/metadata.h5ad")
metadata = meta.obs

# assemble seg df with filt cells col, xy cols, and seg name
high_quality_cells = (
cxg.index[np.where((transcripts_per_cell(cxg) >= min_transcripts))[0]]
.astype(str)
.values.tolist()
)
seg_df = metadata[["center_x", "center_y"]].copy()
seg_df.index = seg_df.index.astype(str)
seg_df.loc[:, "source"] = seg_name
seg_df.loc[:, "low_quality_cells"] = np.where(
seg_df.index.isin(high_quality_cells), False, True
)
seg_df.index = seg_name + "_" + seg_df.index
return seg_df


def get_segmentation_data(
bc,
anndata_a,
@@ -1261,7 +1219,7 @@ def get_segmentation_data(
else:
seg_h5ad = anndata_b
name = seg_name_b
df_cols = ["center_x", "center_y", "source"]
df_cols = ["center_x", "center_y", "source", "low_quality_cells"]
# sum across .X to get sum transcripts/cell > 40
if "low_quality_cells" not in seg_h5ad.obs.columns.tolist():
high_quality_cells = [
@@ -1271,13 +1229,13 @@
]
seg_h5ad.obs["low_quality_cells"] = [True] * len(seg_h5ad.obs)
seg_h5ad.obs.iloc[high_quality_cells, -1] = False
df_cols.append("low_quality_cells")
seg_h5ad.obs.loc[:, "source"] = name
# save only necessary columns
if "center_x" not in seg_h5ad.obs.columns.tolist():
seg_h5ad.obs["center_x"] = seg_h5ad.obsm["spatial"][:, 0]
seg_h5ad.obs["center_y"] = seg_h5ad.obsm["spatial"][:, 1]
seg_df = seg_h5ad.obs[df_cols]
seg_df.index = f"{name}_" + seg_df.index
seg_dfs.append(seg_df)
seg_comp_df = pd.concat(seg_dfs, axis=0)
seg_comp_df.index = seg_comp_df.index.astype(str)
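
The low_quality_cells column handled above is derived from per-cell transcript totals when the input AnnData does not already carry one. A sketch of that flagging step, assuming counts live in .X and using the 40-transcript threshold mentioned in the comment:

import numpy as np
import anndata as ad

def flag_low_quality(adata: ad.AnnData, min_transcripts: int = 40) -> np.ndarray:
    # sum counts across genes for each cell (works for dense or sparse .X);
    # cells below the threshold are flagged low quality (True)
    counts = np.asarray(adata.X.sum(axis=1)).ravel()
    return counts < min_transcripts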
@@ -1312,31 +1270,16 @@ def get_mutual_matches(dfa, dfb, nn_dist):
columns=["match_dfa_index"],
)
match_to_dfb["same_cell"] = np.where(dists_a <= nn_dist, True, False)
matched_dfb_only = match_to_dfb[match_to_dfb["same_cell"] == True]

match_to_dfa = pd.DataFrame(
data=dfa.index,
index=pd.Index(dfb.iloc[inds_b].index.values.tolist(), name="match_dfb_index"),
columns=["match_dfa_index"],
data=dfb.iloc[inds_b].index.values.tolist(),
index=pd.Index(dfa.index, name="match_dfa_index"),
columns=["match_dfb_index"],
)
match_to_dfa["same_cell"] = np.where(dists_b <= nn_dist, True, False)
mutual_matches = pd.merge(match_to_dfa, match_to_dfb, how="outer")
mutual_matches = mutual_matches.set_index("match_dfa_index").join(
match_to_dfa.reset_index().set_index("match_dfa_index"),
how="left",
rsuffix="_match",
)
mutual_match_dict = (
mutual_matches[mutual_matches["same_cell"] == True]
.drop(["same_cell", "same_cell_match"], axis=1)
.to_dict()
)
inv_mutual_match_dict = {
v: k for k, v in mutual_match_dict["match_dfb_index"].items()
}
# added union operator for dictionaries in 2020 for 3.9+ (PEP 584), discussed https://stackoverflow.com/questions/38987/how-do-i-merge-two-dictionaries-in-a-single-expression-in-python
mutual_matches_stacked = (
mutual_match_dict["match_dfb_index"] | inv_mutual_match_dict
)
return mutual_matches_stacked
matched_dfa_only = match_to_dfa[match_to_dfa["same_cell"] == True]
return matched_dfb_only, matched_dfa_only
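
get_mutual_matches now hands back the two per-direction match tables directly instead of stacking them into one dictionary. The nearest-neighbor core behind each table looks roughly like this (a sketch, assuming center_x/center_y centroid columns as in seg_comp_df; the real function queries both directions):

import pandas as pd
from scipy.spatial import cKDTree

def nearest_matches(dfa, dfb, nn_dist):
    # query each cell centroid in dfa against a KD-tree of dfb centroids
    tree_b = cKDTree(dfb[["center_x", "center_y"]].values)
    dists, inds = tree_b.query(dfa[["center_x", "center_y"]].values, k=1)
    matches = pd.DataFrame({"match_dfb_index": dfb.index[inds]}, index=dfa.index)
    # only pairs closer than nn_dist count as the same cell
    matches["same_cell"] = dists <= nn_dist
    return matches[matches["same_cell"]]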


def create_node_df_sankey(
50 changes: 50 additions & 0 deletions spatial_compare/utils.py
@@ -3,6 +3,8 @@
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.spatial import distance
from sklearn.metrics import adjusted_rand_score
import scipy as sp


@@ -492,3 +494,51 @@ def compare_reference_and_spatial(
reference_anndata.obs[target_obs_key] = (
scale_factor * reference_anndata.obs[target_obs_key]
)


def calculate_hausdorff_dist(
    seg_comp_df, anndata_a, anndata_b, comp_col, data_names, annotation_col="Class_label"
):
    """
    Computes the directed Hausdorff distance between the cell centroids of two
    segmentations and plots both point sets, along with the adjusted Rand index
    of the annotations of mutually matched cells (shown in the plot title).
    data_names: two-element list of segmentation names matching the "source"
    values in seg_comp_df.
    """
    # get class labels for all matched cells for anndata a and b:
    # rows from segmentation a store the ID of their match in b (and vice versa),
    # so each list below indexes into the *other* segmentation's obs
    matched_cells_a = (
        seg_comp_df[seg_comp_df["source"] == data_names[0]]
        .loc[:, comp_col]
        .dropna()
        .str.removeprefix(data_names[1] + "_")
        .values.tolist()
    )
    matched_cells_b = (
        seg_comp_df[seg_comp_df["source"] == data_names[1]]
        .loc[:, comp_col]
        .dropna()
        .str.removeprefix(data_names[0] + "_")
        .values.tolist()
    )
    annotations_a = anndata_a.obs.loc[matched_cells_b, annotation_col]
    annotations_b = anndata_b.obs.loc[matched_cells_a, annotation_col]
    ari = adjusted_rand_score(annotations_a, annotations_b)

    # collect the spatial coordinates of each segmentation
    coords_a = seg_comp_df[seg_comp_df["source"] == data_names[0]][
        ["center_x", "center_y"]
    ].values
    coords_b = seg_comp_df[seg_comp_df["source"] == data_names[1]][
        ["center_x", "center_y"]
    ].values

    # calculate the directed Hausdorff distance between the two point sets
    hausdorff_dist = round(distance.directed_hausdorff(coords_a, coords_b)[0], 3)

    plt.figure()
    plt.scatter(coords_a[:, 0], coords_a[:, 1], color="blue", s=1, label=data_names[0])
    plt.scatter(coords_b[:, 0], coords_b[:, 1], color="red", s=1, label=data_names[1])
    plt.title(f"Hausdorff Distance: {hausdorff_dist}, ARI: {round(ari, 3)}")
    plt.legend(bbox_to_anchor=[1.05, 1.0])
    plt.xlabel("spatial x coords")
    plt.ylabel("spatial y coords")
    plt.show()
    return hausdorff_dist
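
A possible call pattern for the new helper, assuming a populated seg_comp_df and a match column produced by collect_mutual_match_and_doublets (the segmentation names and column name mirror the tests below and are not a fixed API):

from spatial_compare.utils import calculate_hausdorff_dist

hd = calculate_hausdorff_dist(
    seg_comp_df,
    anndata_a,
    anndata_b,
    comp_col="match_XEN_filt_SIS_filt",
    data_names=["XEN", "SIS"],
    annotation_col="Class_label",
)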
Binary file added tests/data/sc_mtg_mini_a.h5ad
Binary file not shown.
Binary file added tests/data/sc_mtg_mini_b.h5ad
Binary file not shown.
39 changes: 39 additions & 0 deletions tests/test_spatial_compare.py
@@ -11,6 +11,8 @@
"CJ_BG_mini2.h5ad",
"CJ_BG_mini3.h5ad",
"CJ_BG_mini4.h5ad",
"sc_mtg_mini_a.h5ad",
"sc_mtg_mini_a.h5ad",
]

TEST_DIR = SC_DIR.joinpath("tests").joinpath("data")
@@ -25,6 +27,8 @@
TEST_DF = pd.DataFrame.from_records(TEST_DF_RECORDS)
TEST_DF.index = ["a", "b", "c"]

TEST_SEG_NAMES = ["XEN", "SIS"]


def test_get_column_ordering():
ordered_columns = get_column_ordering(TEST_DF, ordered_rows=["a", "b", "c"])
@@ -36,3 +40,38 @@ def test_SpatialCompare():
# mock up test
sc = SpatialCompare(TEST_ANNDATAS[0], TEST_ANNDATAS[1])
assert all(sc.ad_0[0].obs.columns == sc.ad_1[1].obs.columns)


def test_segmentation_comparison():
sc = SpatialCompare(
TEST_ANNDATAS[4],
TEST_ANNDATAS[5],
data_names=[TEST_SEG_NAMES[0], TEST_SEG_NAMES[1]],
obsm_key="spatial",
)
seg_comp_df = sc.collect_mutual_match_and_doublets(
bc="1370519421", save=False, reuse_saved=False, savepath=str(TEST_DIR)
)
seg_a_df = seg_comp_df[seg_comp_df["source"] == TEST_SEG_NAMES[0]]
seg_b_df = seg_comp_df[seg_comp_df["source"] == TEST_SEG_NAMES[1]]
assert len(TEST_ANNDATAS[4]) == len(seg_a_df)
assert len(TEST_ANNDATAS[5]) == len(seg_b_df)

high_q_a_cells = seg_a_df[seg_a_df["low_quality_cells"] == False]
high_q_b_cells = seg_b_df[seg_b_df["low_quality_cells"] == False]
assert len(high_q_a_cells.iloc[:, 4].dropna()) == len(
high_q_b_cells.iloc[:, 4].dropna()
)

matched_cells_a = (
seg_comp_df[seg_comp_df["source"] == TEST_SEG_NAMES[0]].iloc[:, 4].dropna()
)
matched_cells_b = (
seg_comp_df[seg_comp_df["source"] == TEST_SEG_NAMES[1]].iloc[:, 4].dropna()
)
assert (
seg_comp_df.loc[
matched_cells_a.index.values.tolist(), seg_comp_df.columns[4]
].values.tolist()
== matched_cells_a.values.tolist()
)