neurobagel · alyssadai · Oct 29, 2024 · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024
diff --git a/app/api/crud.py b/app/api/crud.py
@@ -199,6 +199,7 @@ async def get(
                     )
                 )
 
+                # Get the unique versions of each pipeline that was run on each session
                 pipeline_grouped_data = (
                     group.groupby(
                         [
@@ -207,27 +208,37 @@ async def get(
                             "session_type",
                             "pipeline_name",
                         ],
-                        dropna=True,
-                    )
-                    .agg(
+                        # Keep NaNs to ensure that when there are no pipeline_name values in the query result,
+                        # we don't end up with an empty dataframe for pipeline_grouped_data
+                        dropna=False,
+                    ).agg(
                         {
                             "pipeline_version": lambda x: list(
                                 x.dropna().unique()
                             )
                         }
                     )
+                    # Turn indices from the groupby back into dataframe columns
                     .reset_index()
                 )
 
+                # Aggregate all completed pipelines for each session
+                session_grouped_data = pipeline_grouped_data.groupby(
+                    ["sub_id", "session_id", "session_type"],
+                )
                 session_completed_pipeline_data = (
-                    pipeline_grouped_data.groupby(
-                        ["sub_id", "session_id", "session_type"]
-                    )
-                    .apply(
-                        lambda x: dict(
-                            zip(x["pipeline_name"], x["pipeline_version"])
-                        )
+                    session_grouped_data.apply(
+                        lambda x: {
+                            pname: pvers
+                            for pname, pvers in zip(
+                                x["pipeline_name"], x["pipeline_version"]
+                            )
+                            if not pd.isnull(pname)
+                        }
                     )
+                    # NOTE: reset_index(name=...) is only available on a pd.Series, so the result of
+                    # .apply() above must be a Series (pd.DataFrame.reset_index() has no "name" arg).
+                    # See related https://github.com/pandas-dev/pandas/issues/55225
                     .reset_index(name="completed_pipelines")
                 )
 
@@ -238,11 +249,6 @@ async def get(
                     how="left",
                 )
 
-                # ensure that for sessions missing completed pipeline info, completed_pipelines is still a dict rather than null/nan
-                subject_data["completed_pipelines"] = subject_data[
-                    "completed_pipelines"
-                ].apply(lambda x: x if isinstance(x, dict) else {})
-
                 # TODO: Revisit this as there may be a more elegant solution.
                 # The following code replaces columns with all NaN values with values of None, to ensure they show up in the final JSON as `null`.
                 # This is needed as the above .agg() seems to turn NaN into None for object-type columns (which have some non-missing values)