Skip to content

Commit 4407337

Browse files
committed
Make IO work
1 parent 2fd3083 commit 4407337

File tree

6 files changed

+408
-1115
lines changed

6 files changed

+408
-1115
lines changed

README.md

Lines changed: 73 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,74 @@
1-
pandas Expressions
2-
==================
1+
# pandas Expressions
32

4-
Nothing to see.
3+
Lazy pandas API POC.
4+
5+
## Reading from parquet files
6+
7+
Prepare the parquet file:
8+
9+
```python
10+
import pandas as pd
11+
12+
pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "c": 1, "d": 1.5}).to_parquet("test.parquet")
13+
```
14+
15+
```python
16+
from pandas_expr import read_parquet
17+
18+
df = read_parquet("test.parquet")
19+
result = df[df["b"] == "x"][["a", "c"]]
20+
```
21+
22+
Let's see what this query looks like:
23+
24+
```python
25+
result.pprint()
26+
27+
Projection: columns=['a', 'c']
28+
Filter:
29+
ReadParquet: path='test.parquet'
30+
EQ: right='x'
31+
Projection: columns='b'
32+
ReadParquet: path='test.parquet'
33+
```
34+
35+
There is no need to read all of the data — we can do better:
36+
37+
```python
38+
result.optimize().pprint()
39+
40+
ReadParquet: path='test.parquet' columns=['a', 'c'] filters=[[('b', '==', 'x')]]
41+
```
42+
43+
We pushed the column selection and the filter into the ``read_parquet`` call.
44+
45+
46+
## DataFrame constructor
47+
48+
The DataFrame constructor mirrors the regular pandas constructor, but it is
49+
lazy and does not trigger any actual computation.
50+
51+
```python
52+
from pandas_expr import DataFrame
53+
54+
df = DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "c": 1, "d": 1.5})
55+
df = df.replace(1, 5).fillna(100)[["a", "b"]]
56+
57+
df.pprint()
58+
59+
Projection: columns=['a', 'b']
60+
Fillna: value=100
61+
Replace: to_replace=1 value=5
62+
PandasIO: data={'a': [1, 2, 3], 'b': ['x', 'y', 'z'], 'c': 1, 'd': 1.5}
63+
```
64+
65+
We can again make this more efficient:
66+
67+
```python
68+
df.optimize(fuse=False).pprint()
69+
70+
Fillna: value=100
71+
Replace: to_replace=1 value=5
72+
Projection: columns=['a', 'b']
73+
PandasIO: data={'a': [1, 2, 3], 'b': ['x', 'y', 'z'], 'c': 1, 'd': 1.5}
74+
```

pandas_expr/_collection.py

Lines changed: 2 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -616,44 +616,21 @@ def read_parquet(
616616
path=None,
617617
columns=None,
618618
filters=None,
619-
categories=None,
620-
index=None,
621619
storage_options=None,
622-
dtype_backend=None,
623-
calculate_divisions=False,
624-
ignore_metadata_file=False,
625-
metadata_task_size=None,
626-
split_row_groups="infer",
627-
blocksize="default",
628-
aggregate_files=None,
629-
parquet_file_extension=(".parq", ".parquet", ".pq"),
630-
filesystem="fsspec",
631-
**kwargs,
620+
dtype_backend=pd.api.extensions.no_default,
632621
):
633622
from pandas_expr.io.parquet import ReadParquet
634623

635624
if not isinstance(path, str):
636625
path = stringify_path(path)
637626

638-
kwargs["dtype_backend"] = dtype_backend
639-
640627
return new_collection(
641628
ReadParquet(
642629
path,
643630
columns=_convert_to_list(columns),
644631
filters=filters,
645-
categories=categories,
646-
index=index,
647632
storage_options=storage_options,
648-
calculate_divisions=calculate_divisions,
649-
ignore_metadata_file=ignore_metadata_file,
650-
metadata_task_size=metadata_task_size,
651-
split_row_groups=split_row_groups,
652-
blocksize=blocksize,
653-
aggregate_files=aggregate_files,
654-
parquet_file_extension=parquet_file_extension,
655-
filesystem=filesystem,
656-
kwargs=kwargs,
633+
dtype_backend=dtype_backend,
657634
)
658635
)
659636

pandas_expr/_concat.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,6 @@ def _simplify_up(self, parent):
5959
for frame, cols in zip(self._frames, columns_frame)
6060
]
6161
return type(parent)(
62-
type(self)(self.join, self._kwargs, *frames),
62+
type(self)(self.join, *frames),
6363
*parent.operands[1:],
6464
)

pandas_expr/io/csv.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
1+
import functools
2+
3+
import pandas as pd
4+
15
from pandas_expr.io.io import BlockwiseIO
26

37

48
class ReadCSV(BlockwiseIO):
5-
_parameters = ["filename", "usecols", "header", "_partitions", "storage_options"]
9+
_parameters = ["filename", "usecols", "header", "storage_options"]
610
_defaults = {
711
"usecols": None,
812
"header": "infer",
9-
"_partitions": None,
1013
"storage_options": None,
1114
}
15+
_keyword_only = ["usecols", "header", "storage_options"]
16+
operation = staticmethod(pd.read_csv)
1217

13-
@property
18+
@functools.cached_property
1419
def _meta(self):
15-
return self._ddf._meta
20+
return pd.read_csv(self.filename, **self._kwargs, nrows=1).iloc[:0]

0 commit comments

Comments
 (0)