Add pandas-path integration #135

Closed · wants to merge 2 commits
3 changes: 3 additions & 0 deletions HISTORY.md
@@ -1,5 +1,8 @@
# cloudpathlib Changelog

## v0.4.1 (unreleased)
- Add `.cloud` accessor to `pandas.Series` and `pandas.Index` through [pandas-path](https://github.com/drivendataorg/pandas-path) integration. Registered when the user does `from cloudpathlib.pandas import cloud`.

## v0.4.0 (2021-03-13)

- Added rich comparison operator support to cloud paths, which means you can now use them with `sorted`. ([#129](https://github.com/drivendataorg/cloudpathlib/pull/129))
Empty file added cloudpathlib/pandas/__init__.py
Empty file.
9 changes: 9 additions & 0 deletions cloudpathlib/pandas/cloud/__init__.py
@@ -0,0 +1,9 @@
try:
    from pandas_path.accessor import register_path_accessor
except ImportError:
    raise ImportError("To use the .cloud accessor, you must pip install pandas-path.")

from ...cloudpath import CloudPath


register_path_accessor("cloud", CloudPath)
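For intuition, the registration mechanism can be approximated with pandas' public extension API. The sketch below is a hypothetical simplification, not pandas-path's actual implementation: each element is wrapped in the path class, the requested attribute is looked up, and the result is converted back to a string.

```python
import pandas as pd
from pathlib import PurePosixPath


def register_path_accessor_sketch(name, path_class):
    # Hypothetical stand-in for pandas_path.accessor.register_path_accessor:
    # registers a Series accessor that applies `path_class` elementwise.
    @pd.api.extensions.register_series_accessor(name)
    class PathAccessor:
        def __init__(self, series):
            self._series = series

        def __getattr__(self, attr):
            # Wrap each string in the path class, fetch the attribute,
            # and convert back to str so the Series stays plain strings.
            return self._series.apply(lambda p: str(getattr(path_class(p), attr)))


register_path_accessor_sketch("purepath", PurePosixPath)

s = pd.Series(["a/b/c.txt", "a/d/e.txt"])
print(s.purepath.name.tolist())  # ['c.txt', 'e.txt']
```

The real `register_path_accessor("cloud", CloudPath)` call does the equivalent wiring for `CloudPath`.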
63 changes: 63 additions & 0 deletions docs/docs/integrations.md
@@ -34,5 +34,68 @@ fancy2.path
#> PosixPath('mydir/myfile.txt')
```


## Pandas and [pandas-path](https://github.com/drivendataorg/pandas-path)

[`pandas-path`](https://github.com/drivendataorg/pandas-path) provides `pathlib` methods and properties through a custom accessor (`.path`). The library also supports registering custom accessors for any class that implements the pathlib API.

### The `.cloud` accessor

We expose a `.cloud` accessor on `pandas.Series` and `pandas.Index` objects if you import it. This lets you call any `CloudPath` method or property directly on a Series just by adding `.cloud`.

To use the `.cloud` accessor, you must have `pandas-path` installed through pip:

```
pip install pandas-path
```

All you need to do to register the accessor is `from cloudpathlib.pandas import cloud`.
> **Reviewer (Member):** Why the extra module layer, instead of just `import cloudpathlib.pandas`?
>
> **Author:** This is to mirror the pandas-path import, which looks like `from pandas_path import path`. I think it's nice to (1) make the structure similar, and (2) see the name of the accessor when you do the import so you can see how they might get tied together.


For example:

```python
from cloudpathlib.pandas import cloud
import pandas as pd

pd.Series([
's3://cats/1.jpg',
's3://cats/2.jpg',
's3://dogs/1.jpg',
's3://dogs/2.jpg',
]).cloud.bucket
#> 0 cats
#> 1 cats
#> 2 dogs
#> 3 dogs
#> dtype: object
```
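For comparison, without the accessor the same bucket extraction is a manual string operation or `apply`. A rough equivalent of what `.cloud.bucket` handles for you:

```python
import pandas as pd

s = pd.Series([
    's3://cats/1.jpg',
    's3://cats/2.jpg',
    's3://dogs/1.jpg',
    's3://dogs/2.jpg',
])

# "s3://cats/1.jpg".split("/") -> ["s3:", "", "cats", "1.jpg"],
# so index 2 is the bucket name.
buckets = s.apply(lambda p: p.split("/")[2])
print(buckets.tolist())  # ['cats', 'cats', 'dogs', 'dogs']
```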

Under the hood, `pandas-path` converts the items in the Series from strings to `CloudPath` objects and then back to strings before returning, so you don't end up with complex Python objects in your DataFrame. Because of this, if you need to pass any parameters to the client, you should set the default client ahead of time.

For example, let's say that the account `special_account` had access to `special_bucket` but that the credentials were not accessible through an environment variable or credentials file.

```python
from cloudpathlib import S3Client
from cloudpathlib.pandas import cloud

import pandas as pd

# default client will get used by `.cloud` accessor if we set ahead of time
client = S3Client(aws_access_key_id="special_account", aws_secret_access_key="special_key")
client.set_as_default_client()

pd.Series([
's3://special_bucket/cats/1.jpg',
's3://special_bucket/cats/2.jpg',
's3://special_bucket/dogs/1.jpg',
's3://special_bucket/dogs/2.jpg',
]).cloud.key
#> 0 cats/1.jpg
#> 1 cats/2.jpg
#> 2 dogs/1.jpg
#> 3 dogs/2.jpg
#> dtype: object
```
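The default-client pattern used above can be sketched in miniature. This toy code is not cloudpathlib's implementation; it just illustrates why setting the default ahead of time works: paths constructed without an explicit client fall back to whatever was most recently registered.

```python
class ToyClient:
    # Module-level fallback, mirroring the role of set_as_default_client.
    _default = None

    def __init__(self, account):
        self.account = account

    def set_as_default_client(self):
        ToyClient._default = self


class ToyPath:
    def __init__(self, raw, client=None):
        self.raw = raw
        # No explicit client passed: fall back to the registered default.
        self.client = client if client is not None else ToyClient._default


ToyClient("special_account").set_as_default_client()
p = ToyPath("s3://special_bucket/cats/1.jpg")
print(p.client.account)  # special_account
```

Because the accessor constructs `CloudPath` objects internally, the default client is the only hook you have to pass credentials.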

---
<sup>Examples created with [reprexlite](https://github.com/jayqi/reprexlite)</sup>
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -12,6 +12,7 @@ mkdocs-material>=7
mkdocstrings>=0.15
mypy
pandas
pandas-path>=0.3.0
> **Reviewer (Member):** Wonder if it makes sense to add a `pandas` extras option.
>
> **Author:** I think it's conceptually nice to have the extras be focused on cloud providers, since that's the major differentiator.

pillow
pydantic
pytest
46 changes: 46 additions & 0 deletions tests/test_pandas_accessor.py
@@ -0,0 +1,46 @@
from cloudpathlib.pandas import cloud # noqa

import pandas as pd


def test_joins(rig):
    s = pd.Series(
        [
            f"{rig.cloud_prefix}bucket/a/b/c.txt",
            f"{rig.cloud_prefix}bucket/a/b/c",
            f"{rig.cloud_prefix}bucket/a/d/e.txt",
        ]
    )

    # make sure we don't register the default `path` accessor from pandas-path
    assert not hasattr(s, "path")

    # test path manipulations
    assert s.cloud.name.tolist() == ["c.txt", "c", "e.txt"]
    assert s.cloud.stem.tolist() == ["c", "c", "e"]
    assert s.cloud.parent.tolist() == [
        f"{rig.cloud_prefix}bucket/a/b",
        f"{rig.cloud_prefix}bucket/a/b",
        f"{rig.cloud_prefix}bucket/a/d",
    ]

    # test cloud specific methods
    if hasattr(rig.path_class, "bucket"):
        assert s.cloud.bucket.tolist() == ["bucket"] * 3
    elif hasattr(rig.path_class, "container"):
        assert s.cloud.container.tolist() == ["bucket"] * 3

    # test joins work as expected
    s = pd.Series(
        [
            f"{rig.cloud_prefix}bucket/a/b",
            f"{rig.cloud_prefix}bucket/a/c",
            f"{rig.cloud_prefix}bucket/a/d",
        ]
    )

    assert (s.cloud / ["file1.txt", "file2.txt", "file3.txt"]).tolist() == [
        f"{rig.cloud_prefix}bucket/a/b/file1.txt",
        f"{rig.cloud_prefix}bucket/a/c/file2.txt",
        f"{rig.cloud_prefix}bucket/a/d/file3.txt",
    ]
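The elementwise join in the last assertion pairs each path with the file name at the same position, analogous to `zip` plus `pathlib`'s `/` operator. A plain-`pathlib` sketch of the same semantics (no cloud rig needed):

```python
from pathlib import PurePosixPath

paths = ["bucket/a/b", "bucket/a/c", "bucket/a/d"]
names = ["file1.txt", "file2.txt", "file3.txt"]

# Join the i-th path with the i-th name, like `s.cloud / names` above.
joined = [str(PurePosixPath(p) / n) for p, n in zip(paths, names)]
print(joined)
# ['bucket/a/b/file1.txt', 'bucket/a/c/file2.txt', 'bucket/a/d/file3.txt']
```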