2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -63,7 +63,7 @@ repos:
hooks:
- id: vulture
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.20.1
rev: v1.20.2
⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🌐 Web query:

mypy version 1.20.2 release notes or changelog

💡 Result:

No evidence was found for a mypy 1.20.2 release as of 2026-05-07. The latest versions are mypy 1.20.0 (released March 31, 2026 [1][2]) and 1.20.1 (April 2026 [3][4][5]). GitHub tags confirm releases up to v1.20.0 [1]; there is no v1.20.2 tag or PyPI page [2]. The v1.20.1 changelog includes fixes such as disabling SQLite cache sync and narrowing corrections [3][4]. The official 1.20.0 release notes cover improved type narrowing, the drop of Python 3.9 support, SQLite caching by default, local partial types improvements, and plans for Mypy 2.0 [6][7]. For full 1.20 details, see https://github.com/python/mypy/blob/v1.20.1/CHANGELOG.md [3] or https://mypy.readthedocs.io/en/stable/changelog.html [7].

Change mypy version to v1.20.1 or v1.20.0.

mypy v1.20.2 does not exist. As of May 2026, the latest available versions are v1.20.1 (April 2026) and v1.20.0 (March 2026). Update the revision to one of these releases.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In @.pre-commit-config.yaml at line 66, the mypy hook specifies an invalid rev v1.20.2; update the rev to a valid release (e.g., change the rev value from "v1.20.2" to "v1.20.1" or "v1.20.0") so the pre-commit hook points to an existing mypy release.

hooks:
- id: mypy
exclude: >
27 changes: 18 additions & 9 deletions cytotable/convert.py
@@ -412,6 +412,7 @@ def _source_pageset_to_parquet(
A string of the output filepath.
"""

import hashlib
import pathlib

import duckdb
@@ -426,10 +427,16 @@ def _source_pageset_to_parquet(

source_type = str(source["source_path"].suffix).lower()

# attempt to build dest_path
# hash of parent path discriminates sources whose parent dirs share a name
# (e.g. analyses/{1,2}/analysis) — see cytomining/CytoTable#442
source_parent_hash = hashlib.sha1(
str(source["source_path"].parent).encode("utf-8"),
usedforsecurity=False,
).hexdigest()[:12]
source_dest_path = (
f"{dest_path}/{str(AnyPath(source_group_name).stem).lower()}/"
f"{str(source['source_path'].parent.name).lower()}"
f"{str(source['source_path'].parent.name).lower()}/"
f"{source_parent_hash}"
)
pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True)
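
A minimal sketch (not part of the diff; the `dest` path is hypothetical) of why hashing the full parent path discriminates sources whose parent dirs share a name. `usedforsecurity=False` requires Python 3.9+:

```python
import hashlib
import pathlib

# two sources whose parent dirs share the trailing name "analysis",
# as in cytomining/CytoTable#442
for parent in ("analyses/1/analysis", "analyses/2/analysis"):
    digest = hashlib.sha1(
        str(parent).encode("utf-8"), usedforsecurity=False
    ).hexdigest()[:12]
    # identical final path segment, but the hash of the *full* parent
    # path differs, so the intermediate dirs no longer collide
    print(pathlib.Path("dest") / pathlib.Path(parent).name.lower() / digest)
```

Equal dir names hash equally, but distinct parent paths never share a suffix short of a truncated-SHA-1 collision, which restores uniqueness without lengthening paths with the full parent hierarchy.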

@@ -825,13 +832,15 @@ def _concat_source_group(
# remove the file which was written in the concatted parquet file (we no longer need it)
pathlib.Path(table).unlink()

# attempt to clean up dir containing original table(s) only if it's empty
try:
pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir()
except OSError as os_err:
# raise only if we don't have a dir not empty errno
if os_err.errno != errno.ENOTEMPTY:
raise
# clean up the per-source hash dir and its parent if empty
chunk_parent = pathlib.Path(source["table"][0]).parent
for cleanup_dir in (chunk_parent, chunk_parent.parent):
try:
cleanup_dir.rmdir()
except OSError as os_err:
if os_err.errno != errno.ENOTEMPTY:
raise
break

# return the concatted parquet filename
concatted[0]["table"] = [destination_path]
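
A minimal standalone sketch (hypothetical `prune` helper and directory layout, not CytoTable API) of the two-level rmdir walk above, showing why the shared parent dir survives until its last hash dir is removed:

```python
import errno
import pathlib
import tempfile

root = pathlib.Path(tempfile.mkdtemp())
leaf_a = root / "analysis" / "hash-a"
leaf_b = root / "analysis" / "hash-b"
for leaf in (leaf_a, leaf_b):
    leaf.mkdir(parents=True)

def prune(leaf: pathlib.Path) -> None:
    # remove the hash dir, then its parent, stopping at the first
    # directory that still holds other sources' output
    for cleanup_dir in (leaf, leaf.parent):
        try:
            cleanup_dir.rmdir()
        except OSError as err:
            if err.errno != errno.ENOTEMPTY:
                raise
            break  # sibling hash dir still present; stop here

prune(leaf_a)  # removes hash-a only; "analysis" still holds hash-b
prune(leaf_b)  # removes hash-b, then the now-empty "analysis" parent
assert not (root / "analysis").exists()
```

Since rmdir never deletes a non-empty directory, breaking on ENOTEMPTY is enough to stop the walk at the first still-occupied level; any other OSError signals a real failure and is re-raised.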
61 changes: 61 additions & 0 deletions tests/test_convert_threaded.py
@@ -6,6 +6,7 @@


import pathlib
import shutil
from typing import List

import anndata as ad
@@ -470,3 +471,63 @@ def test_convert_nested_dirs(fx_tempdir: pathlib.Path):

table = parquet.read_table(source=result)
assert table.shape == (397, 6049)


def test_convert_multi_source_colliding_parent_dir_names(
load_parsl_threaded: None,
fx_tempdir: str,
data_dir_cellprofiler: str,
):
"""
Regression test for cytomining/CytoTable#442: multi-source convert with
identical parent dir names (analyses/{1,2,3}/analysis) used to collide on
intermediate parquet paths.
"""

src_root = pathlib.Path(fx_tempdir) / "analyses"
for site in ("1", "2", "3"):
site_dir = src_root / site / "analysis"
site_dir.mkdir(parents=True, exist_ok=True)
for table_name in ("Cells.csv", "Cytoplasm.csv", "Nuclei.csv", "Image.csv"):
shutil.copy(
f"{data_dir_cellprofiler}/ExampleHuman/{table_name}",
site_dir / table_name,
)

result = convert(
source_path=str(src_root),
dest_path=f"{fx_tempdir}/multi_site.parquet",
dest_datatype="parquet",
preset="cellprofiler_csv",
join=False,
)
assert isinstance(result, dict)

assert set(result.keys()) == {
"Cells.csv",
"Cytoplasm.csv",
"Nuclei.csv",
"Image.csv",
}

single_site_result = convert(
source_path=f"{data_dir_cellprofiler}/ExampleHuman",
dest_path=f"{fx_tempdir}/single_site.parquet",
dest_datatype="parquet",
source_datatype="csv",
preset="cellprofiler_csv",
join=False,
)
assert isinstance(single_site_result, dict)

for compartment in ("Cells.csv", "Cytoplasm.csv", "Nuclei.csv"):
multi_rows = parquet.read_table(
source=result[compartment][0]["table"][0]
).num_rows
single_rows = parquet.read_table(
source=single_site_result[compartment][0]["table"][0]
).num_rows
assert multi_rows == 3 * single_rows, (
f"{compartment}: expected 3x single-site rows ({3 * single_rows}),"
f" got {multi_rows}"
)