Commit 6bcd82d

feat: Implement parquet datasource with schema conversion
fix: some schema propagation bugs
feat: add a main java app to test on nyc trips dataset
1 parent 56254c2 commit 6bcd82d

10 files changed: +396 additions, −16 deletions


.gitignore

Lines changed: 4 additions & 1 deletion
@@ -95,4 +95,7 @@ TODO.md
 TODO.rst
 TODO
 
-# End of https://www.toptal.com/developers/gitignore/api/java,maven,visualstudiocode,gradle
+### Datasets ###
+datasets/
+
+# End of https://www.toptal.com/developers/gitignore/api/java,maven,visualstudiocode,gradle

.vscode/launch.json

Lines changed: 7 additions & 0 deletions
@@ -4,6 +4,13 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
+        {
+            "type": "java",
+            "name": "NycTripsBenchmark",
+            "request": "launch",
+            "mainClass": "co.clflushopt.glint.NycTripsBenchmark",
+            "projectName": "glint"
+        },
         {
             "type": "java",
             "name": "Current File",

glint/pom.xml

Lines changed: 36 additions & 0 deletions
@@ -61,6 +61,42 @@
             <artifactId>mockito-core</artifactId>
             <version>5.15.2</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.parquet</groupId>
+            <artifactId>parquet-common</artifactId>
+            <version>1.15.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.parquet</groupId>
+            <artifactId>parquet-encoding</artifactId>
+            <version>1.15.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.parquet</groupId>
+            <artifactId>parquet-column</artifactId>
+            <version>1.15.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.parquet</groupId>
+            <artifactId>parquet-hadoop</artifactId>
+            <version>1.15.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-hdfs</artifactId>
+            <version>3.3.0</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+            <version>3.3.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-mapreduce-client-core</artifactId>
+            <version>3.3.0</version>
+        </dependency>
     </dependencies>
 
     <build>
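
The Hadoop artifacts are needed because the Parquet scan added later in this commit opens files through parquet-hadoop, which works in terms of hadoop-common's Configuration and Path types. A minimal sketch of that entry point, mirroring the ParquetScan constructor further down (the file path is only illustrative):

    // ParquetFileReader comes from parquet-hadoop; Path and Configuration from hadoop-common.
    ParquetFileReader reader = ParquetFileReader
            .open(HadoopInputFile.fromPath(new Path("./datasets/example.parquet"), new Configuration()));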
glint/src/main/java/co/clflushopt/glint/App.java

Lines changed: 67 additions & 1 deletion
@@ -1,11 +1,77 @@
 package co.clflushopt.glint;
 
+import java.io.FileNotFoundException;
+import java.util.Iterator;
+
+import org.apache.arrow.vector.types.pojo.ArrowType;
+
+import co.clflushopt.glint.core.ExecutionContext;
+import co.clflushopt.glint.dataframe.DataFrame;
+import co.clflushopt.glint.query.logical.expr.AggregateExpr;
+import co.clflushopt.glint.query.logical.expr.CastExpr;
+import co.clflushopt.glint.query.logical.expr.ColumnExpr;
+import co.clflushopt.glint.query.logical.expr.LogicalExpr;
+import co.clflushopt.glint.query.logical.plan.LogicalPlan;
+import co.clflushopt.glint.query.optimizer.QueryOptimizer;
+import co.clflushopt.glint.types.RecordBatch;
+
 /**
  * Hello world!
  *
  */
 public class App {
     public static void main(String[] args) {
         System.out.println("Welcome to the Glint query compiler");
+        try {
+            nycTripsBenchmark(args);
+        } catch (FileNotFoundException e) {
+            e.printStackTrace();
+        }
+    }
+
+    public static void nycTripsBenchmark(String[] args) throws FileNotFoundException {
+        // Create execution context
+        ExecutionContext ctx = ExecutionContext.create().build();
+
+        long startTime = System.currentTimeMillis();
+        try {
+            // Create DataFrame and apply transformations
+            DataFrame df = ctx.readParquet("./datasets/yellow_tripdata_2019-01.parquet", null);
+
+            System.out.println("Logical Plan:\t" + LogicalPlan.format(df.getLogicalPlan()));
+            System.out.println("Schema:\t" + df.getSchema());
+
+            // Optimize and execute the plan
+            LogicalPlan optimizedPlan = QueryOptimizer.optimize(df.getLogicalPlan());
+            System.out.println("Optimized Plan:\t" + LogicalPlan.format(optimizedPlan));
+
+            // Execute and print results
+            Iterator<RecordBatch> results = ctx.execute(optimizedPlan);
+
+            while (results.hasNext()) {
+                RecordBatch batch = results.next();
+                System.out.println(batch.getSchema());
+                System.out.println(batch.toCsv());
+            }
+        } finally {
+            long endTime = System.currentTimeMillis();
+            System.out.println("Query took " + (endTime - startTime) + " ms");
+        }
+    }
+
+    // Helper methods for creating expressions
+    private static LogicalExpr col(String name) {
+        return new ColumnExpr(name);
+    }
+
+    private static LogicalExpr cast(LogicalExpr expr, ArrowType targetType) {
+        return new CastExpr(expr, targetType);
+    }
+
+    private static AggregateExpr max(LogicalExpr expr) {
+        return new AggregateExpr.Max(expr);
     }
-}
+}
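
The col / cast / max helpers at the bottom of App.java are not exercised by the benchmark yet. A hedged sketch of the kind of expression they appear intended to build; the column name and Arrow type are illustrative and not taken from the trips schema (FloatingPointPrecision is org.apache.arrow.vector.types.FloatingPointPrecision):

    // Hypothetical expression: MAX(CAST(fare_amount AS DOUBLE)).
    AggregateExpr maxFare = max(
            cast(col("fare_amount"), new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)));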

glint/src/main/java/co/clflushopt/glint/core/ExecutionContext.java

Lines changed: 39 additions & 0 deletions
@@ -3,12 +3,18 @@
 import java.io.FileNotFoundException;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.Optional;
 
 import co.clflushopt.glint.dataframe.DataFrame;
 import co.clflushopt.glint.dataframe.DataFrameImpl;
 import co.clflushopt.glint.datasource.CsvDataSource;
+import co.clflushopt.glint.datasource.ParquetDataSource;
+import co.clflushopt.glint.query.logical.plan.LogicalPlan;
 import co.clflushopt.glint.query.logical.plan.Scan;
+import co.clflushopt.glint.query.optimizer.QueryOptimizer;
+import co.clflushopt.glint.query.planner.QueryPlanner;
+import co.clflushopt.glint.types.RecordBatch;
 import co.clflushopt.glint.types.Schema;
 
 public class ExecutionContext {
@@ -22,6 +28,13 @@ private ExecutionContext(HashMap<String, Object> context, Configuration configur
         this.config = configuration;
     }
 
+    public Iterator<RecordBatch> execute(LogicalPlan plan) {
+        // Optimize the logical plan, lower it to a physical plan, and execute it.
+        var optimizedPlan = QueryOptimizer.optimize(plan);
+        var physicalPlan = QueryPlanner.createPhysicalPlan(optimizedPlan);
+        return physicalPlan.execute();
+    }
+
     /**
      * Configuration class that encapsulates all execution settings. Uses builder
      * pattern for a clean configuration API.
@@ -74,6 +87,32 @@ public DataFrame readCsv(String path, Optional<Schema> schema, CsvReaderOptions
         return new DataFrameImpl(new Scan(options.getTableName(), source, Collections.emptyList()));
     }
 
+    /**
+     * Creates a DataFrame from a CSV file with an explicit, non-optional schema.
+     *
+     * @param path    path to the CSV file
+     * @param schema  schema to use for the file
+     * @param options CSV reader options
+     */
+    public DataFrame readCsv(String path, Schema schema, CsvReaderOptions options)
+            throws FileNotFoundException {
+        var source = new CsvDataSource(path, Optional.of(schema), options.hasHeader(),
+                defaultBatchSize);
+        return new DataFrameImpl(new Scan(options.getTableName(), source, Collections.emptyList()));
+    }
+
+    /**
+     * Creates a DataFrame from a Parquet file.
+     *
+     * @param path   path to the Parquet file
+     * @param schema optional schema; currently unused, the schema is read from
+     *               the Parquet file footer
+     */
+    public DataFrame readParquet(String path, Optional<Schema> schema) {
+        var source = new ParquetDataSource(path);
+        return new DataFrameImpl(new Scan("parquet_scan", source, Collections.emptyList()));
+    }
+
     /**
      * Creates a temporary table from a DataFrame for use in subsequent queries.
     */
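
Taken together with the benchmark in App.java, the intended call path is readParquet → DataFrame → execute. A minimal usage sketch (the dataset path is the one the benchmark uses and is only illustrative; since execute() already optimizes and lowers the plan internally, the logical plan can be passed to it directly):

    // Sketch: read a Parquet file and stream the result batches as CSV text.
    ExecutionContext ctx = ExecutionContext.create().build();
    DataFrame df = ctx.readParquet("./datasets/yellow_tripdata_2019-01.parquet", null);
    Iterator<RecordBatch> results = ctx.execute(df.getLogicalPlan());
    while (results.hasNext()) {
        System.out.println(results.next().toCsv());
    }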
glint/src/main/java/co/clflushopt/glint/datasource/ParquetDataSource.java

Lines changed: 158 additions & 0 deletions
@@ -0,0 +1,158 @@
+package co.clflushopt.glint.datasource;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.stream.Collectors;
+
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.column.page.PageReadStore;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+
+import co.clflushopt.glint.types.ArrowFieldVector;
+import co.clflushopt.glint.types.RecordBatch;
+import co.clflushopt.glint.types.Schema;
+import co.clflushopt.glint.types.SchemaConverter;
+
+public class ParquetDataSource implements DataSource {
+    private final String filename;
+
+    public ParquetDataSource(String filename) {
+        this.filename = filename;
+    }
+
+    @Override
+    public Schema getSchema() {
+        try (ParquetScan scan = new ParquetScan(filename, Collections.emptyList())) {
+            org.apache.arrow.vector.types.pojo.Schema arrowSchema = SchemaConverter
+                    .fromParquet(scan.getSchema()).toArrow();
+            return SchemaConverter.fromArrow(arrowSchema);
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to read schema from Parquet file", e);
+        }
+    }
+
+    @SuppressWarnings("resource")
+    @Override
+    public Iterable<RecordBatch> scan(List<String> projection) {
+        // Return an Iterable that creates a new ParquetScan each time iterator() is
+        // called
+        return () -> {
+            try {
+                return new ParquetScan(filename, projection).iterator();
+            } catch (IOException e) {
+                throw new RuntimeException("Failed to create ParquetScan", e);
+            }
+        };
+    }
+}
+
+class ParquetScan implements AutoCloseable {
+    private final ParquetFileReader reader;
+    private final List<String> columns;
+    private final org.apache.parquet.schema.MessageType schema;
+
+    public ParquetScan(String filename, List<String> columns) throws IOException {
+        this.columns = columns;
+        this.reader = ParquetFileReader
+                .open(HadoopInputFile.fromPath(new Path(filename), new Configuration()));
+        this.schema = reader.getFooter().getFileMetaData().getSchema();
+    }
+
+    public Iterator<RecordBatch> iterator() {
+        return new ParquetIterator(reader, columns);
+    }
+
+    @Override
+    public void close() throws IOException {
+        reader.close();
+    }
+
+    public org.apache.parquet.schema.MessageType getSchema() {
+        return schema;
+    }
+}
+
+class ParquetIterator implements Iterator<RecordBatch> {
+    private final ParquetFileReader reader;
+    private final List<String> projectedColumns;
+    private final org.apache.parquet.schema.MessageType schema;
+    private final org.apache.arrow.vector.types.pojo.Schema arrowSchema;
+    private final org.apache.arrow.vector.types.pojo.Schema projectedArrowSchema;
+    private RecordBatch batch;
+
+    public ParquetIterator(ParquetFileReader reader, List<String> projectedColumns) {
+        this.reader = reader;
+        this.projectedColumns = projectedColumns;
+        this.schema = reader.getFooter().getFileMetaData().getSchema();
+        this.arrowSchema = SchemaConverter.fromParquet(schema).toArrow();
+
+        if (projectedColumns.isEmpty()) {
+            // Project all columns
+            this.projectedArrowSchema = arrowSchema;
+        } else {
+            // Create projected schema
+            List<org.apache.arrow.vector.types.pojo.Field> projectedFields = projectedColumns
+                    .stream().map(
+                            name -> arrowSchema.getFields().stream()
+                                    .filter(f -> f.getName().equals(name)).findFirst()
+                                    .orElseThrow(() -> new IllegalArgumentException(
+                                            "Column not found: " + name)))
+                    .collect(Collectors.toList());
+
+            this.projectedArrowSchema = new org.apache.arrow.vector.types.pojo.Schema(
+                    projectedFields);
+        }
+    }
+
+    @Override
+    public boolean hasNext() {
+        batch = nextBatch();
+        return batch != null;
+    }
+
+    @Override
+    public RecordBatch next() {
+        if (batch == null) {
+            throw new NoSuchElementException();
+        }
+        RecordBatch result = batch;
+        batch = null;
+        return result;
+    }
+
+    private RecordBatch nextBatch() {
+        try (PageReadStore pages = reader.readNextRowGroup()) {
+            if (pages == null) {
+                return null;
+            }
+
+            if (pages.getRowCount() > Integer.MAX_VALUE) {
+                throw new IllegalStateException("Row count exceeds maximum integer value");
+            }
+
+            int rows = (int) pages.getRowCount();
+
+            VectorSchemaRoot root = VectorSchemaRoot.create(projectedArrowSchema,
+                    new RootAllocator(Long.MAX_VALUE));
+            root.allocateNew();
+            root.setRowCount(rows);
+
+            Schema convertedSchema = SchemaConverter.fromArrow(projectedArrowSchema);
+
+            return new RecordBatch(convertedSchema, root.getFieldVectors().stream()
+                    .map(ArrowFieldVector::new).collect(Collectors.toList()));
+        } catch (IOException e) {
+            // TODO Auto-generated catch block
+            e.printStackTrace();
+        }
+        return batch;
+    }
+}
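
As committed, nextBatch() sizes Arrow vectors to the row group (allocateNew / setRowCount) but never copies values out of the PageReadStore, so the returned batches carry only the schema and the row count. A hedged sketch of how the values could be materialized with parquet-mr's low-level column readers; the ColumnReadStoreImpl / GroupRecordConverter usage and signatures are assumptions about a possible follow-up, not APIs this commit uses:

    import org.apache.parquet.column.ColumnDescriptor;
    import org.apache.parquet.column.ColumnReadStore;
    import org.apache.parquet.column.ColumnReader;
    import org.apache.parquet.column.impl.ColumnReadStoreImpl;
    import org.apache.parquet.column.page.PageReadStore;
    import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
    import org.apache.parquet.schema.MessageType;

    class RowGroupMaterializer {
        // Walk one row group column by column and hand each non-null value to a
        // per-type copy step (e.g. Float8Vector.setSafe / BigIntVector.setSafe).
        static void readRowGroup(PageReadStore pages, MessageType schema) {
            ColumnReadStore store = new ColumnReadStoreImpl(pages,
                    new GroupRecordConverter(schema).getRootConverter(), schema, "glint");
            for (ColumnDescriptor column : schema.getColumns()) {
                ColumnReader reader = store.getColumnReader(column);
                for (long i = 0, n = reader.getTotalValueCount(); i < n; i++) {
                    if (reader.getCurrentDefinitionLevel() == column.getMaxDefinitionLevel()) {
                        // Non-null value: dispatch on the column's primitive type
                        // and copy it into the matching Arrow vector.
                    }
                    reader.consume();
                }
            }
        }
    }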

glint/src/main/java/co/clflushopt/glint/query/logical/plan/Scan.java

Lines changed: 3 additions & 0 deletions
@@ -48,6 +48,9 @@ public List<LogicalPlan> getChildren() {
 
     private Schema infer() {
        var schema = this.dataSource.getSchema();
+        assert schema != null;
+        assert schema.getFields().size() > 0;
+
        if (projections.isEmpty()) {
            return schema;
        }
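
Note that these assert statements only run when the JVM is started with assertions enabled; under a default java invocation they are no-ops. For example (classpath illustrative):

    java -ea -cp glint/target/classes co.clflushopt.glint.App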

0 commit comments
