
Commit 15bbadb

fix: correct schema type checking in native_iceberg_compat (#1755)
1 parent 7cfeb8b commit 15bbadb

File tree

4 files changed: +102 −78 lines changed

common/src/main/java/org/apache/comet/parquet/NativeBatchReader.java

Lines changed: 98 additions & 11 deletions
@@ -25,11 +25,11 @@
 import java.lang.reflect.InvocationTargetException;
 import java.lang.reflect.Method;
 import java.net.URI;
-import java.net.URISyntaxException;
 import java.nio.channels.Channels;
 import java.util.*;
 
 import scala.Option;
+import scala.collection.JavaConverters;
 import scala.collection.Seq;
 import scala.collection.mutable.Buffer;
 
@@ -52,6 +52,7 @@
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.schema.GroupType;
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.Type;
 import org.apache.spark.TaskContext;
@@ -61,6 +62,7 @@
 import org.apache.spark.sql.comet.parquet.CometParquetReadSupport;
 import org.apache.spark.sql.comet.util.Utils$;
 import org.apache.spark.sql.execution.datasources.PartitionedFile;
+import org.apache.spark.sql.execution.datasources.parquet.ParquetColumn;
 import org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter;
 import org.apache.spark.sql.execution.metric.SQLMetric;
 import org.apache.spark.sql.types.DataType;
@@ -76,8 +78,6 @@
 import org.apache.comet.vector.CometVector;
 import org.apache.comet.vector.NativeUtil;
 
-import static org.apache.comet.parquet.TypeUtil.isEqual;
-
 /**
  * A vectorized Parquet reader that reads a Parquet file in a batched fashion.
  *
@@ -113,6 +113,7 @@ public class NativeBatchReader extends RecordReader<Void, ColumnarBatch> impleme
 
   private StructType sparkSchema;
   private StructType dataSchema;
+  MessageType fileSchema;
   private MessageType requestedSchema;
   private CometVector[] vectors;
   private AbstractColumnReader[] columnReaders;
@@ -124,6 +125,8 @@ public class NativeBatchReader extends RecordReader<Void, ColumnarBatch> impleme
   private ParquetMetadata footer;
   private byte[] nativeFilter;
 
+  private ParquetColumn parquetColumn;
+
   /**
    * Whether the native scan should always return decimal represented by 128 bits, regardless of its
    * precision. Normally, this should be true if native execution is enabled, since Arrow compute
@@ -229,7 +232,13 @@ public NativeBatchReader(AbstractColumnReader[] columnReaders) {
    * Initialize this reader. The reason we don't do it in the constructor is that we want to close
    * any resource hold by this reader when error happens during the initialization.
    */
-  public void init() throws URISyntaxException, IOException {
+  public void init() throws Throwable {
+
+    conf.set("spark.sql.parquet.binaryAsString", "false");
+    conf.set("spark.sql.parquet.int96AsTimestamp", "false");
+    conf.set("spark.sql.caseSensitive", "false");
+    conf.set("spark.sql.parquet.inferTimestampNTZ.enabled", "true");
+    conf.set("spark.sql.legacy.parquet.nanosAsLong", "false");
 
     useDecimal128 =
         conf.getBoolean(
@@ -257,10 +266,11 @@ public void init() throws URISyntaxException, IOException {
         CometInputFile.fromPath(path, conf), footer, readOptions, cometReadOptions, metrics)) {
 
       requestedSchema = footer.getFileMetaData().getSchema();
-      MessageType fileSchema = requestedSchema;
+      fileSchema = requestedSchema;
+      ParquetToSparkSchemaConverter converter = new ParquetToSparkSchemaConverter(conf);
 
       if (sparkSchema == null) {
-        sparkSchema = new ParquetToSparkSchemaConverter(conf).convert(requestedSchema);
+        sparkSchema = converter.convert(requestedSchema);
       } else {
         requestedSchema =
             CometParquetReadSupport.clipParquetSchema(
@@ -269,9 +279,11 @@ public void init() throws URISyntaxException, IOException {
           throw new IllegalArgumentException(
               String.format(
                   "Spark schema has %d columns while " + "Parquet schema has %d columns",
-                  sparkSchema.size(), requestedSchema.getColumns().size()));
+                  sparkSchema.size(), requestedSchema.getFieldCount()));
         }
       }
+      this.parquetColumn =
+          converter.convertParquetColumn(requestedSchema, Option.apply(this.sparkSchema));
 
       String timeZoneId = conf.get("spark.sql.session.timeZone");
       // Native code uses "UTC" always as the timeZoneId when converting from spark to arrow schema.
@@ -283,6 +295,8 @@ public void init() throws URISyntaxException, IOException {
       // Create Column readers
       List<Type> fields = requestedSchema.getFields();
       List<Type> fileFields = fileSchema.getFields();
+      ParquetColumn[] parquetFields =
+          JavaConverters.seqAsJavaList(parquetColumn.children()).toArray(new ParquetColumn[0]);
      int numColumns = fields.size();
      if (partitionSchema != null) numColumns += partitionSchema.size();
      columnReaders = new AbstractColumnReader[numColumns];
@@ -332,9 +346,8 @@ public void init() throws URISyntaxException, IOException {
       } else if (optFileField.isPresent()) {
         // The column we are reading may be a complex type in which case we check if each field in
         // the requested type is in the file type (and the same data type)
-        if (!isEqual(field, optFileField.get())) {
-          throw new UnsupportedOperationException("Schema evolution is not supported");
-        }
+        // This makes the same check as Spark's VectorizedParquetReader
+        checkColumn(parquetFields[i]);
         missingColumns[i] = false;
       } else {
         if (field.getRepetition() == Type.Repetition.REQUIRED) {
@@ -407,6 +420,77 @@ public void init() throws URISyntaxException, IOException {
     isInitialized = true;
   }
 
+  private void checkParquetType(ParquetColumn column) throws IOException {
+    String[] path = JavaConverters.seqAsJavaList(column.path()).toArray(new String[0]);
+    if (containsPath(fileSchema, path)) {
+      if (column.isPrimitive()) {
+        ColumnDescriptor desc = column.descriptor().get();
+        ColumnDescriptor fd = fileSchema.getColumnDescription(desc.getPath());
+        TypeUtil.checkParquetType(fd, column.sparkType());
+      } else {
+        for (ParquetColumn childColumn : JavaConverters.seqAsJavaList(column.children())) {
+          checkColumn(childColumn);
+        }
+      }
+    } else { // A missing column which is either primitive or complex
+      if (column.required()) {
+        // Column is missing in data but the required data is non-nullable. This file is invalid.
+        throw new IOException(
+            "Required column is missing in data file. Col: " + Arrays.toString(path));
+      }
+    }
+  }
+
+  /**
+   * From Spark's VectorizedParquetRecordReader: check whether a column from the requested schema
+   * is missing from the file schema, or whether it conforms to the type of the file schema. The
+   * path check uses containsPath below, which, unlike {@link MessageType#containsPath(String[])},
+   * also accepts paths to non-leaf nodes.
+   */
+  private void checkColumn(ParquetColumn column) throws IOException {
+    String[] path = JavaConverters.seqAsJavaList(column.path()).toArray(new String[0]);
+    if (containsPath(fileSchema, path)) {
+      if (column.isPrimitive()) {
+        ColumnDescriptor desc = column.descriptor().get();
+        ColumnDescriptor fd = fileSchema.getColumnDescription(desc.getPath());
+        if (!fd.equals(desc)) {
+          throw new UnsupportedOperationException("Schema evolution not supported.");
+        }
+      } else {
+        for (ParquetColumn childColumn : JavaConverters.seqAsJavaList(column.children())) {
+          checkColumn(childColumn);
+        }
+      }
+    } else { // A missing column which is either primitive or complex
+      if (column.required()) {
+        // Column is missing in data but the required data is non-nullable. This file is invalid.
+        throw new IOException(
+            "Required column is missing in data file. Col: " + Arrays.toString(path));
+      }
+    }
+  }
+
+  /**
+   * Checks whether the given 'path' exists in 'parquetType'. The difference between this and {@link
+   * MessageType#containsPath(String[])} is that the latter only supports paths to leaf nodes, while
+   * this supports paths to both leaf and non-leaf nodes.
+   */
+  private boolean containsPath(Type parquetType, String[] path) {
+    return containsPath(parquetType, path, 0);
+  }
+
+  private boolean containsPath(Type parquetType, String[] path, int depth) {
+    if (path.length == depth) return true;
+    if (parquetType instanceof GroupType) {
+      String fieldName = path[depth];
+      GroupType parquetGroupType = (GroupType) parquetType;
+      if (parquetGroupType.containsField(fieldName)) {
+        return containsPath(parquetGroupType.getType(fieldName), path, depth + 1);
+      }
+    }
+    return false;
+  }
+
   public void setSparkSchema(StructType schema) {
     this.sparkSchema = schema;
   }
@@ -532,7 +616,10 @@ private int loadNextBatch() throws Throwable {
     if (importer != null) importer.close();
     importer = new CometSchemaImporter(ALLOCATOR);
 
-    List<ColumnDescriptor> columns = requestedSchema.getColumns();
+    for (ParquetColumn childColumn : JavaConverters.seqAsJavaList(parquetColumn.children())) {
+      checkParquetType(childColumn);
+    }
+
     List<Type> fields = requestedSchema.getFields();
     for (int i = 0; i < fields.size(); i++) {
       if (!missingColumns[i]) {

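Note: the new containsPath walk is what lets checkColumn accept paths to non-leaf (group) nodes, which MessageType#containsPath rejects. A minimal, self-contained sketch of that difference, using a hypothetical schema and a helper that mirrors the private method above:

import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.Type;

public class ContainsPathSketch {
  // Same recursion as NativeBatchReader.containsPath: an exhausted path means every
  // segment matched; otherwise descend into the group field named by the next segment.
  static boolean containsPath(Type parquetType, String[] path, int depth) {
    if (path.length == depth) return true;
    if (parquetType instanceof GroupType) {
      GroupType group = (GroupType) parquetType;
      if (group.containsField(path[depth])) {
        return containsPath(group.getType(path[depth]), path, depth + 1);
      }
    }
    return false;
  }

  public static void main(String[] args) {
    MessageType schema =
        MessageTypeParser.parseMessageType(
            "message spark_schema { optional group s { optional int32 a; } }");
    System.out.println(containsPath(schema, new String[] {"s"}, 0)); // true: non-leaf path
    System.out.println(containsPath(schema, new String[] {"s", "a"}, 0)); // true: leaf path
    System.out.println(schema.containsPath(new String[] {"s"})); // false: leaf paths only
  }
}
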
common/src/main/java/org/apache/comet/parquet/TypeUtil.java

Lines changed: 0 additions & 58 deletions
@@ -20,8 +20,6 @@
 package org.apache.comet.parquet;
 
 import java.util.Arrays;
-import java.util.List;
-import java.util.Optional;
 
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.schema.*;
@@ -319,60 +317,4 @@ private static boolean isUnsignedIntTypeMatched(
   private static boolean isSpark40Plus() {
     return package$.MODULE$.SPARK_VERSION().compareTo("4.0") >= 0;
   }
-
-  public static boolean isComplexType(Type t) {
-    return !t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED);
-  }
-
-  // From Parquet Type.java
-  public static boolean eqOrBothNull(Object o1, Object o2) {
-    return (o1 == null && o2 == null) || (o1 != null && o1.equals(o2));
-  }
-
-  // From Parquet Type.java
-  public static boolean equals(Type one, Type other) {
-    return one.getName().equals(other.getName())
-        && one.getRepetition() == other.getRepetition()
-        && eqOrBothNull(one.getRepetition(), other.getRepetition())
-        && eqOrBothNull(one.getId(), other.getId())
-        && eqOrBothNull(one.getLogicalTypeAnnotation(), other.getLogicalTypeAnnotation());
-  }
-
-  //
-  // Compare a field with another field and return true if they are the same. Unlike
-  // the equals method for Type (and derived classes), allows requested to have fields
-  // that are not in actual.
-  //
-  public static boolean isEqual(Type requested, Type actual) {
-    if (requested == null && actual == null) {
-      return true;
-    }
-    if (requested == null || actual == null) {
-      return false;
-    }
-    if (requested.isPrimitive() && actual.isPrimitive()) {
-      return requested.asPrimitiveType().equals(actual.asPrimitiveType());
-    } else if (!requested.isPrimitive() && !actual.isPrimitive()) {
-      if (equals(requested, actual)) {
-        // GroupType.equals also checks if LogicalTypeAnnotation is the same.
-        // But it really is not necessary here.
-        List<Type> requestedFields = requested.asGroupType().getFields();
-        List<Type> actualFields = requested.asGroupType().getFields();
-        for (Type field : requestedFields) {
-          Optional<Type> optActualField =
-              actualFields.stream().filter(f -> f.getName().equals(field.getName())).findFirst();
-          if (optActualField.isPresent()) {
-            if (!isEqual(field, optActualField.get())) {
-              return false;
-            }
-          }
-        }
-      } else {
-        return false;
-      }
-    } else {
-      return false; // one is a primitive type and the other is not.
-    }
-    return true;
-  }
 }

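Note for reviewers: the removed isEqual populated both requestedFields and actualFields from requested.asGroupType(), so nested fields were effectively compared against themselves and group-type mismatches could slip through. The reader now delegates per-column checks to TypeUtil.checkParquetType, as the call in the new checkParquetType method shows. A minimal sketch of that call, assuming a hypothetical single-column file schema:

import org.apache.comet.parquet.TypeUtil;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.spark.sql.types.DataTypes;

public class CheckParquetTypeSketch {
  public static void main(String[] args) {
    MessageType fileSchema =
        MessageTypeParser.parseMessageType("message spark_schema { optional int32 id; }");
    // Look the column up in the file schema by path, as checkParquetType does.
    ColumnDescriptor fd = fileSchema.getColumnDescription(new String[] {"id"});
    // No-op when the Spark read type is compatible with the Parquet column;
    // throws an exception describing the mismatch otherwise.
    TypeUtil.checkParquetType(fd, DataTypes.IntegerType);
  }
}
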
common/src/main/scala/org/apache/spark/sql/comet/parquet/CometParquetReadSupport.scala

Lines changed: 2 additions & 8 deletions
@@ -194,8 +194,6 @@ object CometParquetReadSupport {
           .addField(clipParquetType(repeatedGroup, elementType, caseSensitive, useFieldId))
           .named(parquetList.getName)
       } else {
-        // Otherwise, the repeated field's type is the element type with the repeated field's
-        // repetition.
         val newRepeatedGroup = Types
           .repeatedGroup()
           .addField(
@@ -208,15 +206,11 @@ object CometParquetReadSupport {
           newRepeatedGroup
         }
 
+      // Otherwise, the repeated field's type is the element type with the repeated field's
+      // repetition.
       Types
         .buildGroup(parquetList.getRepetition)
         .as(LogicalTypeAnnotation.listType())
-        .addField(
-          Types
-            .repeatedGroup()
-            .addField(
-              clipParquetType(repeatedGroup.getType(0), elementType, caseSensitive, useFieldId))
-            .named(repeatedGroup.getName))
         .addField(newElementType)
         .named(parquetList.getName)
     }

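Note: the clipping logic above rebuilds the standard 3-level Parquet LIST layout, and the fix drops a second, duplicated addField of the repeated group so the clipped list keeps exactly one repeated child (newElementType). For reference, a minimal sketch with hypothetical field names that builds that layout with the same parquet Types builder API used in the diff:

import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type;
import org.apache.parquet.schema.Types;

public class ListLayoutSketch {
  public static void main(String[] args) {
    // optional group ints (LIST) { repeated group list { optional int32 element; } }
    GroupType list =
        Types.buildGroup(Type.Repetition.OPTIONAL)
            .as(LogicalTypeAnnotation.listType())
            .addField(
                Types.repeatedGroup()
                    .addField(Types.optional(PrimitiveTypeName.INT32).named("element"))
                    .named("list"))
            .named("ints");
    System.out.println(list); // exactly one repeated group wrapping one element field
  }
}
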
spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala

Lines changed: 2 additions & 1 deletion
@@ -1233,7 +1233,8 @@ abstract class ParquetReadSuite extends CometTestBase {
 
     withParquetDataFrame(data, schema = Some(readSchema)) { df =>
       // TODO: validate with Spark 3.x and 'usingDataFusionParquetExec=true'
-      if (enableSchemaEvolution || usingDataSourceExec(conf)) {
+      if (enableSchemaEvolution || CometConf.COMET_NATIVE_SCAN_IMPL
+          .get(conf) == CometConf.SCAN_NATIVE_DATAFUSION) {
         checkAnswer(df, data.map(Row.fromTuple))
       } else {
         assertThrows[SparkException](df.collect())

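Note: the test now gates on the scan implementation directly rather than on usingDataSourceExec. A minimal sketch of selecting the native_datafusion scan in a session; the config key and value strings here are assumptions, with CometConf.COMET_NATIVE_SCAN_IMPL and CometConf.SCAN_NATIVE_DATAFUSION being the authoritative definitions:

import org.apache.spark.sql.SparkSession;

public class ScanImplSketch {
  public static void main(String[] args) {
    SparkSession spark =
        SparkSession.builder()
            .master("local[1]")
            // Assumed key/value strings; prefer the CometConf constants in real code.
            .config("spark.comet.scan.impl", "native_datafusion")
            .getOrCreate();
    spark.stop();
  }
}
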