1,595 changes: 1,595 additions & 0 deletions notebooks/Cross_segmentation_comparison_user_guide.ipynb

Large diffs are not rendered by default.

1,685 changes: 0 additions & 1,685 deletions notebooks/cross_seg_comp_user_guide.ipynb

This file was deleted.

139 changes: 41 additions & 98 deletions spatial_compare/spatial_compare.py
@@ -6,10 +6,11 @@
import scanpy as sc
import seaborn as sns
from scipy import sparse
from scipy.spatial import cKDTree
from scipy.spatial import cKDTree, distance
from pathlib import Path
import plotly.graph_objs as go
from matplotlib.ticker import FormatStrFormatter

import warnings

from spatial_compare.utils import grouped_obs_mean
@@ -663,9 +664,13 @@ def collect_mutual_match_and_doublets(
col_name_both_filt = (
"match_" + self.data_names[0] + "_filt_" + self.data_names[1] + "_filt"
)
seg_comp_df[col_name_both_filt] = seg_comp_df.index.map(
get_mutual_matches(dfa, dfb, nn_dist)
) # filt both
matched_dfa_only, matched_dfb_only = get_mutual_matches(dfa, dfb, nn_dist)
seg_comp_df.loc[matched_dfb_only.index.values.tolist(), col_name_both_filt] = (
matched_dfb_only.iloc[:, 0]
)
seg_comp_df.loc[matched_dfa_only.index.values.tolist(), col_name_both_filt] = (
matched_dfa_only.iloc[:, 0]
)

# mutual matches with seg a filtered only
dfa = seg_comp_df[
@@ -676,9 +681,14 @@
col_name_afilt = (
"match_" + self.data_names[0] + "_filt_" + self.data_names[1] + "_unfilt"
)
seg_comp_df[col_name_afilt] = seg_comp_df.index.map(
get_mutual_matches(dfa, dfb, nn_dist)
) # a filt only
# dfa filt only
matched_dfa_only, matched_dfb_only = get_mutual_matches(dfa, dfb, nn_dist)
seg_comp_df.loc[matched_dfb_only.index.values.tolist(), col_name_afilt] = (
matched_dfb_only.iloc[:, 0]
)
seg_comp_df.loc[matched_dfa_only.index.values.tolist(), col_name_afilt] = (
matched_dfa_only.iloc[:, 0]
)

# mutual matches with seg b filtered only
dfa = seg_comp_df[seg_comp_df["source"] == self.data_names[0]]
@@ -689,32 +699,20 @@
col_name_bfilt = (
"match_" + self.data_names[0] + "_unfilt_" + self.data_names[1] + "_filt"
)
seg_comp_df[col_name_bfilt] = seg_comp_df.index.map(
get_mutual_matches(dfa, dfb, nn_dist)
) # b filt only
# dfb filt only
matched_dfa_only, matched_dfb_only = get_mutual_matches(dfa, dfb, nn_dist)
seg_comp_df.loc[matched_dfb_only.index.values.tolist(), col_name_bfilt] = (
matched_dfb_only.iloc[:, 0]
)
seg_comp_df.loc[matched_dfa_only.index.values.tolist(), col_name_bfilt] = (
matched_dfa_only.iloc[:, 0]
)

# save results
if save:
seg_comp_df.to_csv(
savepath
+ bc
+ "_seg_comp_df_"
+ self.data_names[0]
+ "_and_"
+ self.data_names[1]
+ "_populated.csv",
index=True,
)
print(
"Saved to: "
+ savepath
+ bc
+ "_seg_comp_df_"
+ self.data_names[0]
+ "_and_"
+ self.data_names[1]
+ "_populated.csv"
)
result_path = f"{savepath}{bc}_seg_comp_df_{self.data_names[0]}_and_{self.data_names[1]}_populated.csv"
seg_comp_df.to_csv(result_path, index=True)
print("Saved to: " + result_path)
return seg_comp_df
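
The rewritten matching blocks above share one pattern: get_mutual_matches now returns two DataFrames of confirmed matches, one indexed by cells from each segmentation, and both are written into the shared match column with .loc so unmatched cells stay NaN. A minimal sketch of that pattern with toy data (the index values and column name here are illustrative only):

import pandas as pd

seg_comp_df = pd.DataFrame(index=["XEN_1", "XEN_2", "SIS_7"])
col = "match_XEN_filt_SIS_filt"
seg_comp_df[col] = pd.NA

# one frame per direction: index = cell IDs in one segmentation,
# first column = the matched cell ID in the other segmentation
matched_a = pd.DataFrame({"match": ["SIS_7"]}, index=["XEN_1"])
matched_b = pd.DataFrame({"match": ["XEN_1"]}, index=["SIS_7"])

# .loc fills only the matched rows; everything else stays NaN,
# so unmatched cells remain identifiable in the combined frame
seg_comp_df.loc[matched_b.index, col] = matched_b.iloc[:, 0].values
seg_comp_df.loc[matched_a.index, col] = matched_a.iloc[:, 0].values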

def generate_sankey_diagram(
@@ -775,11 +773,12 @@ def generate_sankey_diagram(
+ self.data_names[0]
+ "_"
+ self.data_names[1]
+ ".html"
+ ".png"
)
fig.write_html(filepath)
fig.write_image(filepath)
return fig, unknown_unmatched_cells, filepath
fig.show()
return fig, unknown_unmatched_cells, filepath
return fig, unknown_unmatched_cells
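
Note the export change above: the Sankey figure is now written as a static PNG via fig.write_image rather than as an interactive HTML page. Plotly routes static export through the kaleido package, so the new call path looks roughly like this (node labels and values are toy data, not the library's output):

import plotly.graph_objects as go

fig = go.Figure(
    go.Sankey(
        node=dict(label=["XEN matched", "SIS matched", "unmatched"]),
        link=dict(source=[0, 0], target=[1, 2], value=[80, 20]),
    )
)
# write_image renders a static file (requires kaleido installed);
# write_html, the old behavior, produced an interactive page instead
fig.write_image("sankey_XEN_SIS.png")
fig.show()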

def scaling_check(self, seg_comp_df):
"""
@@ -1183,47 +1182,6 @@ def get_column_ordering(df, ordered_rows):
return output


def create_seg_comp_df(barcode, seg_name, base_path, min_transcripts):
"""
Gathers related segmentation results to create descriptive dataframe
INPUTS
barcode: unique identifier used to locate the specific segmentation path. String.
seg_name: descriptor of the segmentation, e.g. the algorithm name. String.
base_path: path to the segmentation results.
min_transcripts: minimum number of transcripts per cell; cells below this threshold are considered too low quality for mapping.
OUTPUTS
seg_df: dataframe with segmentation cell xy locations, a low-quality-cell flag, and the source segmentation ID. Index is cell IDs prefixed with the segmentation identifier string.
"""
seg_path = Path(base_path).joinpath(barcode)
cell_check = [x for x in seg_path.glob("*.csv") if "cellpose" in x.stem]
if cell_check:
cxg = pd.read_table(
str(seg_path) + "/cellpose-cell-by-gene.csv", index_col=0, sep=","
)
metadata = pd.read_table(
str(seg_path) + "/cellpose_metadata.csv", index_col=0, sep=","
)
else:
cxg = pd.read_table(str(seg_path) + "/cell_by_gene.csv", index_col=0, sep=",")
meta = ad.read_h5ad(str(seg_path) + "/metadata.h5ad")
metadata = meta.obs

# assemble seg df with filt cells col, xy cols, and seg name
high_quality_cells = (
cxg.index[np.where((transcripts_per_cell(cxg) >= min_transcripts))[0]]
.astype(str)
.values.tolist()
)
seg_df = metadata[["center_x", "center_y"]].copy()
seg_df.index = seg_df.index.astype(str)
seg_df.loc[:, "source"] = seg_name
seg_df.loc[:, "low_quality_cells"] = np.where(
seg_df.index.isin(high_quality_cells), False, True
)
seg_df.index = seg_name + "_" + seg_df.index
return seg_df


def get_segmentation_data(
bc,
anndata_a,
@@ -1261,7 +1219,7 @@ def get_segmentation_data(
else:
seg_h5ad = anndata_b
name = seg_name_b
df_cols = ["center_x", "center_y", "source"]
df_cols = ["center_x", "center_y", "source", "low_quality_cells"]
# sum across .X to get sum transcripts/cell > 40
if "low_quality_cells" not in seg_h5ad.obs.columns.tolist():
high_quality_cells = [
@@ -1271,13 +1229,13 @@
]
seg_h5ad.obs["low_quality_cells"] = [True] * len(seg_h5ad.obs)
seg_h5ad.obs.iloc[high_quality_cells, -1] = False
df_cols.append("low_quality_cells")
seg_h5ad.obs.loc[:, "source"] = name
# save only necessary columns
if "center_x" not in seg_h5ad.obs.columns.tolist():
seg_h5ad.obs["center_x"] = seg_h5ad.obsm["spatial"][:, 0]
seg_h5ad.obs["center_y"] = seg_h5ad.obsm["spatial"][:, 1]
seg_df = seg_h5ad.obs[df_cols]
seg_df.index = f"{name}_" + seg_df.index
seg_dfs.append(seg_df)
seg_comp_df = pd.concat(seg_dfs, axis=0)
seg_comp_df.index = seg_comp_df.index.astype(str)
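
The low_quality_cells column handled above is derived from per-cell transcript totals when the input AnnData does not already carry one. A sketch of that flagging step, assuming counts live in .X and using the 40-transcript threshold mentioned in the comment:

import numpy as np
import anndata as ad

def flag_low_quality(adata: ad.AnnData, min_transcripts: int = 40) -> np.ndarray:
    # sum counts across genes for each cell (works for dense or sparse .X);
    # cells below the threshold are flagged low quality (True)
    counts = np.asarray(adata.X.sum(axis=1)).ravel()
    return counts < min_transcripts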
@@ -1312,31 +1270,16 @@ def get_mutual_matches(dfa, dfb, nn_dist):
columns=["match_dfa_index"],
)
match_to_dfb["same_cell"] = np.where(dists_a <= nn_dist, True, False)
matched_dfb_only = match_to_dfb[match_to_dfb["same_cell"] == True]

match_to_dfa = pd.DataFrame(
data=dfa.index,
index=pd.Index(dfb.iloc[inds_b].index.values.tolist(), name="match_dfb_index"),
columns=["match_dfa_index"],
data=dfb.iloc[inds_b].index.values.tolist(),
index=pd.Index(dfa.index, name="match_dfa_index"),
columns=["match_dfb_index"],
)
match_to_dfa["same_cell"] = np.where(dists_b <= nn_dist, True, False)
mutual_matches = pd.merge(match_to_dfa, match_to_dfb, how="outer")
mutual_matches = mutual_matches.set_index("match_dfa_index").join(
match_to_dfa.reset_index().set_index("match_dfa_index"),
how="left",
rsuffix="_match",
)
mutual_match_dict = (
mutual_matches[mutual_matches["same_cell"] == True]
.drop(["same_cell", "same_cell_match"], axis=1)
.to_dict()
)
inv_mutual_match_dict = {
v: k for k, v in mutual_match_dict["match_dfb_index"].items()
}
# added union operator for dictionaries in 2020 for 3.9+ (PEP 584), discussed https://stackoverflow.com/questions/38987/how-do-i-merge-two-dictionaries-in-a-single-expression-in-python
mutual_matches_stacked = (
mutual_match_dict["match_dfb_index"] | inv_mutual_match_dict
)
return mutual_matches_stacked
matched_dfa_only = match_to_dfa[match_to_dfa["same_cell"] == True]
return matched_dfb_only, matched_dfa_only
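
get_mutual_matches now hands back the two per-direction match tables directly instead of stacking them into one dictionary. The nearest-neighbor core behind each table looks roughly like this (a sketch, assuming center_x/center_y centroid columns as in seg_comp_df; the real function queries both directions):

import pandas as pd
from scipy.spatial import cKDTree

def nearest_matches(dfa, dfb, nn_dist):
    # query each cell centroid in dfa against a KD-tree of dfb centroids
    tree_b = cKDTree(dfb[["center_x", "center_y"]].values)
    dists, inds = tree_b.query(dfa[["center_x", "center_y"]].values, k=1)
    matches = pd.DataFrame({"match_dfb_index": dfb.index[inds]}, index=dfa.index)
    # only pairs closer than nn_dist count as the same cell
    matches["same_cell"] = dists <= nn_dist
    return matches[matches["same_cell"]]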


def create_node_df_sankey(
50 changes: 50 additions & 0 deletions spatial_compare/utils.py
@@ -3,6 +3,8 @@
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.spatial import distance
from sklearn.metrics import adjusted_rand_score
import scipy as sp


@@ -492,3 +494,51 @@ def compare_reference_and_spatial(
reference_anndata.obs[target_obs_key] = (
scale_factor * reference_anndata.obs[target_obs_key]
)


def calculate_hausdorff_dist(
    seg_comp_df, anndata_a, anndata_b, comp_col, data_names, annotation_col="Class_label"
):
    """
    Computes the directed Hausdorff distance between the cell centroids of two
    segmentations and plots both point sets, along with the adjusted Rand index
    of the annotations of mutually matched cells (shown in the plot title).
    data_names: two-element list of segmentation names matching the "source"
    values in seg_comp_df.
    """
    # get class labels for all matched cells for anndata a and b:
    # rows from segmentation a store the ID of their match in b (and vice versa),
    # so each list below indexes into the *other* segmentation's obs
    matched_cells_a = (
        seg_comp_df[seg_comp_df["source"] == data_names[0]]
        .loc[:, comp_col]
        .dropna()
        .str.removeprefix(data_names[1] + "_")
        .values.tolist()
    )
    matched_cells_b = (
        seg_comp_df[seg_comp_df["source"] == data_names[1]]
        .loc[:, comp_col]
        .dropna()
        .str.removeprefix(data_names[0] + "_")
        .values.tolist()
    )
    annotations_a = anndata_a.obs.loc[matched_cells_b, annotation_col]
    annotations_b = anndata_b.obs.loc[matched_cells_a, annotation_col]
    ari = adjusted_rand_score(annotations_a, annotations_b)

    # collect the spatial coordinates of each segmentation
    coords_a = seg_comp_df[seg_comp_df["source"] == data_names[0]][
        ["center_x", "center_y"]
    ].values
    coords_b = seg_comp_df[seg_comp_df["source"] == data_names[1]][
        ["center_x", "center_y"]
    ].values

    # calculate the directed Hausdorff distance between the two point sets
    hausdorff_dist = round(distance.directed_hausdorff(coords_a, coords_b)[0], 3)

    plt.figure()
    plt.scatter(coords_a[:, 0], coords_a[:, 1], color="blue", s=1, label=data_names[0])
    plt.scatter(coords_b[:, 0], coords_b[:, 1], color="red", s=1, label=data_names[1])
    plt.title(f"Hausdorff Distance: {hausdorff_dist}, ARI: {round(ari, 3)}")
    plt.legend(bbox_to_anchor=[1.05, 1.0])
    plt.xlabel("spatial x coords")
    plt.ylabel("spatial y coords")
    plt.show()
    return hausdorff_dist
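
A possible call pattern for the new helper, assuming a populated seg_comp_df and a match column produced by collect_mutual_match_and_doublets (the segmentation names and column name mirror the tests below and are not a fixed API):

from spatial_compare.utils import calculate_hausdorff_dist

hd = calculate_hausdorff_dist(
    seg_comp_df,
    anndata_a,
    anndata_b,
    comp_col="match_XEN_filt_SIS_filt",
    data_names=["XEN", "SIS"],
    annotation_col="Class_label",
)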
Binary file added tests/data/sc_mtg_mini_a.h5ad
Binary file not shown.
Binary file added tests/data/sc_mtg_mini_b.h5ad
Binary file not shown.
39 changes: 39 additions & 0 deletions tests/test_spatial_compare.py
@@ -11,6 +11,8 @@
"CJ_BG_mini2.h5ad",
"CJ_BG_mini3.h5ad",
"CJ_BG_mini4.h5ad",
"sc_mtg_mini_a.h5ad",
"sc_mtg_mini_a.h5ad",
]

TEST_DIR = SC_DIR.joinpath("tests").joinpath("data")
@@ -25,6 +27,8 @@
TEST_DF = pd.DataFrame.from_records(TEST_DF_RECORDS)
TEST_DF.index = ["a", "b", "c"]

TEST_SEG_NAMES = ["XEN", "SIS"]


def test_get_column_ordering():
ordered_columns = get_column_ordering(TEST_DF, ordered_rows=["a", "b", "c"])
@@ -36,3 +40,38 @@ def test_SpatialCompare():
# mock up test
sc = SpatialCompare(TEST_ANNDATAS[0], TEST_ANNDATAS[1])
assert all(sc.ad_0[0].obs.columns == sc.ad_1[1].obs.columns)


def test_segmentation_comparison():
sc = SpatialCompare(
TEST_ANNDATAS[4],
TEST_ANNDATAS[5],
data_names=[TEST_SEG_NAMES[0], TEST_SEG_NAMES[1]],
obsm_key="spatial",
)
seg_comp_df = sc.collect_mutual_match_and_doublets(
bc="1370519421", save=False, reuse_saved=False, savepath=str(TEST_DIR)
)
seg_a_df = seg_comp_df[seg_comp_df["source"] == TEST_SEG_NAMES[0]]
seg_b_df = seg_comp_df[seg_comp_df["source"] == TEST_SEG_NAMES[1]]
assert len(TEST_ANNDATAS[4]) == len(seg_a_df)
assert len(TEST_ANNDATAS[5]) == len(seg_b_df)

high_q_a_cells = seg_a_df[seg_a_df["low_quality_cells"] == False]
high_q_b_cells = seg_b_df[seg_b_df["low_quality_cells"] == False]
assert len(high_q_a_cells.iloc[:, 4].dropna()) == len(
high_q_b_cells.iloc[:, 4].dropna()
)

matched_cells_a = (
seg_comp_df[seg_comp_df["source"] == TEST_SEG_NAMES[0]].iloc[:, 4].dropna()
)
matched_cells_b = (
seg_comp_df[seg_comp_df["source"] == TEST_SEG_NAMES[1]].iloc[:, 4].dropna()
)
assert (
seg_comp_df.loc[
matched_cells_a.index.values.tolist(), seg_comp_df.columns[4]
].values.tolist()
== matched_cells_a.values.tolist()
)