@@ -1290,3 +1290,64 @@ def test_to_parquet_decimal(session, bucket, database):
     assert df2[df2.id == 2].iloc[0].decimal_5 is None
     assert df2[df2.id == 3].iloc[0].decimal_2 == Decimal((0, (1, 9, 0), -2))
     assert df2[df2.id == 3].iloc[0].decimal_5 == Decimal((0, (1, 9, 0, 0, 0, 0), -5))
+
+
+def test_read_parquet_dataset(session, bucket):
+    df = pd.DataFrame({
+        "id": [1, 2, 3],
+        "decimal_2": [Decimal((0, (1, 9, 9), -2)),
+                      Decimal((0, (1, 9, 9), -2)),
+                      Decimal((0, (1, 9, 0), -2))],
+        "decimal_5": [
+            Decimal((0, (1, 9, 9, 9, 9, 9), -5)),
+            Decimal((0, (1, 9, 9, 9, 9, 9), -5)),
+            Decimal((0, (1, 9, 0, 0, 0, 0), -5))
+        ],
+        "float": [1.1, 2.2, 3.3],
+        "list_int": [[1, 2], [1], [3, 4, 5]],
+        "list_float": [[1.0, 2.0, 3.0], [9.9], [4.0, 5.0]],
+        "list_string": [["foo"], ["xxx"], ["boo", "bar"]],
+        "list_timestamp": [[datetime(2019, 1, 1), datetime(2019, 1, 2)], [datetime(2019, 1, 3)], [datetime(2019, 1,
+                                                                                                            3)]],
+        "partition": [0, 0, 1]
+    })
+    path = f"s3://{bucket}/test_read_parquet/"
+    session.pandas.to_parquet(dataframe=df,
+                              path=path,
+                              mode="overwrite",
+                              preserve_index=False,
+                              procs_cpu_bound=4,
+                              partition_cols=["partition"])
+    df2 = session.pandas.read_parquet(path=path)
+    assert len(list(df.columns)) == len(list(df2.columns))
+    assert len(df.index) == len(df2.index)
+
+
+def test_read_parquet_file(session, bucket):
+    df = pd.DataFrame({
+        "id": [1, 2, 3],
+        "decimal_2": [Decimal((0, (1, 9, 9), -2)),
+                      Decimal((0, (1, 9, 9), -2)),
+                      Decimal((0, (1, 9, 0), -2))],
+        "decimal_5": [
+            Decimal((0, (1, 9, 9, 9, 9, 9), -5)),
+            Decimal((0, (1, 9, 9, 9, 9, 9), -5)),
+            Decimal((0, (1, 9, 0, 0, 0, 0), -5))
+        ],
+        "float": [1.1, 2.2, 3.3],
+        "list_int": [[1, 2], [1], [3, 4, 5]],
+        "list_float": [[1.0, 2.0, 3.0], [9.9], [4.0, 5.0]],
+        "list_string": [["foo"], ["xxx"], ["boo", "bar"]],
+        "list_timestamp": [[datetime(2019, 1, 1), datetime(2019, 1, 2)], [datetime(2019, 1, 3)], [datetime(2019, 1,
+                                                                                                            3)]],
+        "partition": [0, 0, 1]
+    })
+    path = f"s3://{bucket}/test_read_parquet/"
+    filepath = session.pandas.to_parquet(dataframe=df,
+                                         path=path,
+                                         mode="overwrite",
+                                         preserve_index=False,
+                                         procs_cpu_bound=1)
+    df2 = session.pandas.read_parquet(path=filepath[0])
+    assert len(list(df.columns)) == len(list(df2.columns))
+    assert len(df.index) == len(df2.index)
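
Taken together, the two new tests exercise the full Parquet round trip through the session.pandas API: to_parquet() writes the DataFrame to an S3 prefix (optionally partitioned, returning the list of object paths it wrote), and read_parquet() accepts either the dataset prefix or a single file path. A minimal usage sketch follows; the awswrangler.Session() constructor and the bucket name are illustrative assumptions added for context, since the tests receive session and bucket from fixtures:

    import awswrangler
    import pandas as pd

    # Assumption: a default Session; the test suite builds this via a fixture.
    session = awswrangler.Session()

    df = pd.DataFrame({"id": [1, 2, 3], "partition": [0, 0, 1]})
    path = "s3://my-bucket/test_read_parquet/"  # hypothetical bucket

    # Dataset round trip: write a partitioned dataset, then read the whole prefix back.
    session.pandas.to_parquet(dataframe=df,
                              path=path,
                              mode="overwrite",
                              preserve_index=False,
                              partition_cols=["partition"])
    df_dataset = session.pandas.read_parquet(path=path)

    # Single-file round trip: to_parquet() returns the written object paths,
    # and one of them can be passed straight back to read_parquet().
    filepaths = session.pandas.to_parquet(dataframe=df,
                                          path=path,
                                          mode="overwrite",
                                          preserve_index=False,
                                          procs_cpu_bound=1)
    df_single = session.pandas.read_parquet(path=filepaths[0])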