import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URI;
- import java.net.URISyntaxException;
import java.nio.channels.Channels;
import java.util.*;

import scala.Option;
+ import scala.collection.JavaConverters;
import scala.collection.Seq;
import scala.collection.mutable.Buffer;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+ import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
import org.apache.spark.TaskContext;

import org.apache.spark.sql.comet.parquet.CometParquetReadSupport;
import org.apache.spark.sql.comet.util.Utils$;
import org.apache.spark.sql.execution.datasources.PartitionedFile;
+ import org.apache.spark.sql.execution.datasources.parquet.ParquetColumn;
import org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter;
import org.apache.spark.sql.execution.metric.SQLMetric;
import org.apache.spark.sql.types.DataType;

import org.apache.comet.vector.CometVector;
import org.apache.comet.vector.NativeUtil;

- import static org.apache.comet.parquet.TypeUtil.isEqual;
-
/**
 * A vectorized Parquet reader that reads a Parquet file in a batched fashion.
 *
@@ -113,6 +113,7 @@ public class NativeBatchReader extends RecordReader<Void, ColumnarBatch> impleme
  private StructType sparkSchema;
  private StructType dataSchema;
+ MessageType fileSchema;
  private MessageType requestedSchema;
  private CometVector[] vectors;
  private AbstractColumnReader[] columnReaders;
@@ -124,6 +125,8 @@ public class NativeBatchReader extends RecordReader<Void, ColumnarBatch> impleme
  private ParquetMetadata footer;
  private byte[] nativeFilter;

+ private ParquetColumn parquetColumn;
+
  /**
   * Whether the native scan should always return decimal represented by 128 bits, regardless of its
   * precision. Normally, this should be true if native execution is enabled, since Arrow compute
@@ -229,7 +232,13 @@ public NativeBatchReader(AbstractColumnReader[] columnReaders) {
   * Initialize this reader. The reason we don't do it in the constructor is that we want to close
   * any resources held by this reader when an error happens during initialization.
   */
- public void init() throws URISyntaxException, IOException {
+ public void init() throws Throwable {
+
+   conf.set("spark.sql.parquet.binaryAsString", "false");
+   conf.set("spark.sql.parquet.int96AsTimestamp", "false");
+   conf.set("spark.sql.caseSensitive", "false");
+   conf.set("spark.sql.parquet.inferTimestampNTZ.enabled", "true");
+   conf.set("spark.sql.legacy.parquet.nanosAsLong", "false");

    useDecimal128 =
        conf.getBoolean(
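The hardcoded `conf.set` calls above presumably exist so that the `ParquetToSparkSchemaConverter` constructed later in `init()` finds the Parquet-related SQL options it reads from the Hadoop configuration. A minimal sketch under that assumption (the key names are the standard Spark SQL ones; the values are illustrative defaults, not a recommendation):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.spark.sql.execution.datasources.parquet.ParquetToSparkSchemaConverter;

    class ConverterSetupSketch {
      // Populate the Parquet-related SQL keys before handing the Configuration to the
      // schema converter, mirroring what init() does above. Values are illustrative.
      static ParquetToSparkSchemaConverter newConverter(Configuration conf) {
        conf.set("spark.sql.parquet.binaryAsString", "false");
        conf.set("spark.sql.parquet.int96AsTimestamp", "false");
        conf.set("spark.sql.caseSensitive", "false");
        conf.set("spark.sql.parquet.inferTimestampNTZ.enabled", "true");
        conf.set("spark.sql.legacy.parquet.nanosAsLong", "false");
        return new ParquetToSparkSchemaConverter(conf);
      }
    }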
@@ -257,10 +266,11 @@ public void init() throws URISyntaxException, IOException {
            CometInputFile.fromPath(path, conf), footer, readOptions, cometReadOptions, metrics)) {

      requestedSchema = footer.getFileMetaData().getSchema();
-     MessageType fileSchema = requestedSchema;
+     fileSchema = requestedSchema;
+     ParquetToSparkSchemaConverter converter = new ParquetToSparkSchemaConverter(conf);

      if (sparkSchema == null) {
-       sparkSchema = new ParquetToSparkSchemaConverter(conf).convert(requestedSchema);
+       sparkSchema = converter.convert(requestedSchema);
      } else {
        requestedSchema =
            CometParquetReadSupport.clipParquetSchema(
@@ -269,9 +279,11 @@ public void init() throws URISyntaxException, IOException {
          throw new IllegalArgumentException(
              String.format(
                  "Spark schema has %d columns while " + "Parquet schema has %d columns",
-                 sparkSchema.size(), requestedSchema.getColumns().size()));
+                 sparkSchema.size(), requestedSchema.getFieldCount()));
        }
      }
+     this.parquetColumn =
+         converter.convertParquetColumn(requestedSchema, Option.apply(this.sparkSchema));

      String timeZoneId = conf.get("spark.sql.session.timeZone");
      // Native code uses "UTC" always as the timeZoneId when converting from spark to arrow schema.
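`convertParquetColumn` pairs the clipped `requestedSchema` with the Spark read schema and returns a `ParquetColumn` tree: leaves carry a Parquet `ColumnDescriptor` plus the Catalyst type they are decoded to, and group nodes expose their children. A small traversal sketch using only the accessors that appear in this patch (the printing is illustrative):

    import org.apache.spark.sql.execution.datasources.parquet.ParquetColumn;
    import scala.collection.JavaConverters;

    class ParquetColumnWalkSketch {
      // Recursively visit a ParquetColumn tree, printing each leaf's dotted path
      // and the Spark type it will be read as.
      static void walk(ParquetColumn column) {
        if (column.isPrimitive()) {
          String[] path = JavaConverters.seqAsJavaList(column.path()).toArray(new String[0]);
          System.out.println(String.join(".", path) + " -> " + column.sparkType());
        } else {
          for (ParquetColumn child : JavaConverters.seqAsJavaList(column.children())) {
            walk(child);
          }
        }
      }
    }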
@@ -283,6 +295,8 @@ public void init() throws URISyntaxException, IOException {
      // Create Column readers
      List<Type> fields = requestedSchema.getFields();
      List<Type> fileFields = fileSchema.getFields();
+     ParquetColumn[] parquetFields =
+         JavaConverters.seqAsJavaList(parquetColumn.children()).toArray(new ParquetColumn[0]);
      int numColumns = fields.size();
      if (partitionSchema != null) numColumns += partitionSchema.size();
      columnReaders = new AbstractColumnReader[numColumns];
@@ -332,9 +346,8 @@ public void init() throws URISyntaxException, IOException {
        } else if (optFileField.isPresent()) {
          // The column we are reading may be a complex type in which case we check if each field in
          // the requested type is in the file type (and the same data type)
-         if (!isEqual(field, optFileField.get())) {
-           throw new UnsupportedOperationException("Schema evolution is not supported");
-         }
+         // This performs the same check as Spark's VectorizedParquetRecordReader
+         checkColumn(parquetFields[i]);
          missingColumns[i] = false;
        } else {
          if (field.getRepetition() == Type.Repetition.REQUIRED) {
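`checkColumn` is driven by `parquetFields[i]`: because `parquetFields` comes from `parquetColumn.children()` and `parquetColumn` was built from the same `requestedSchema` as `fields`, index `i` addresses the same top-level column in both representations. A sketch of that pairing (illustrative only, assuming the parallel ordering holds as it does in Spark's vectorized reader):

    import java.util.List;
    import org.apache.parquet.schema.MessageType;
    import org.apache.parquet.schema.Type;
    import org.apache.spark.sql.execution.datasources.parquet.ParquetColumn;
    import scala.collection.JavaConverters;

    class FieldPairingSketch {
      // Pair each top-level Parquet field with its ParquetColumn counterpart by index.
      static void pair(MessageType requestedSchema, ParquetColumn parquetColumn) {
        List<Type> fields = requestedSchema.getFields();
        List<ParquetColumn> cols = JavaConverters.seqAsJavaList(parquetColumn.children());
        for (int i = 0; i < fields.size(); i++) {
          System.out.println(fields.get(i).getName() + " <-> " + cols.get(i).sparkType());
        }
      }
    }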
@@ -407,6 +420,77 @@ public void init() throws URISyntaxException, IOException {
    isInitialized = true;
  }

+ private void checkParquetType(ParquetColumn column) throws IOException {
+   String[] path = JavaConverters.seqAsJavaList(column.path()).toArray(new String[0]);
+   if (containsPath(fileSchema, path)) {
+     if (column.isPrimitive()) {
+       ColumnDescriptor desc = column.descriptor().get();
+       ColumnDescriptor fd = fileSchema.getColumnDescription(desc.getPath());
+       TypeUtil.checkParquetType(fd, column.sparkType());
+     } else {
+       for (ParquetColumn childColumn : JavaConverters.seqAsJavaList(column.children())) {
+         checkColumn(childColumn);
+       }
+     }
+   } else { // A missing column which is either primitive or complex
+     if (column.required()) {
+       // Column is missing in data but the required data is non-nullable. This file is invalid.
+       throw new IOException(
+           "Required column is missing in data file. Col: " + Arrays.toString(path));
+     }
+   }
+ }
+
+ /**
+  * From Spark's VectorizedParquetRecordReader: checks whether a column from the requested schema
+  * is missing from the file schema, or whether it conforms to the type of the file schema.
+  */
+ private void checkColumn(ParquetColumn column) throws IOException {
+   String[] path = JavaConverters.seqAsJavaList(column.path()).toArray(new String[0]);
+   if (containsPath(fileSchema, path)) {
+     if (column.isPrimitive()) {
+       ColumnDescriptor desc = column.descriptor().get();
+       ColumnDescriptor fd = fileSchema.getColumnDescription(desc.getPath());
+       if (!fd.equals(desc)) {
+         throw new UnsupportedOperationException("Schema evolution not supported.");
+       }
+     } else {
+       for (ParquetColumn childColumn : JavaConverters.seqAsJavaList(column.children())) {
+         checkColumn(childColumn);
+       }
+     }
+   } else { // A missing column which is either primitive or complex
+     if (column.required()) {
+       // Column is missing in data but the required data is non-nullable. This file is invalid.
+       throw new IOException(
+           "Required column is missing in data file. Col: " + Arrays.toString(path));
+     }
+   }
+ }
+
+ /**
+  * Checks whether the given 'path' exists in 'parquetType'. The difference between this and {@link
+  * MessageType#containsPath(String[])} is that the latter only supports paths to leaf nodes, while
+  * this supports paths to both leaf and non-leaf nodes.
+  */
+ private boolean containsPath(Type parquetType, String[] path) {
+   return containsPath(parquetType, path, 0);
+ }
+
+ private boolean containsPath(Type parquetType, String[] path, int depth) {
+   if (path.length == depth) return true;
+   if (parquetType instanceof GroupType) {
+     String fieldName = path[depth];
+     GroupType parquetGroupType = (GroupType) parquetType;
+     if (parquetGroupType.containsField(fieldName)) {
+       return containsPath(parquetGroupType.getType(fieldName), path, depth + 1);
+     }
+   }
+   return false;
+ }
+
  public void setSparkSchema(StructType schema) {
    this.sparkSchema = schema;
  }
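The javadoc above describes the difference from `MessageType#containsPath`; a concrete illustration with a made-up nested schema, parsed with parquet-mr's `MessageTypeParser`:

    import org.apache.parquet.schema.MessageType;
    import org.apache.parquet.schema.MessageTypeParser;

    class ContainsPathSketch {
      // MessageType#containsPath only matches complete leaf paths, while the recursive
      // containsPath added in this patch also accepts the intermediate group "address".
      static void demo() {
        MessageType schema = MessageTypeParser.parseMessageType(
            "message spark_schema { optional group address { optional binary city (UTF8); } }");
        System.out.println(schema.containsPath(new String[] {"address", "city"})); // true: leaf path
        System.out.println(schema.containsPath(new String[] {"address"}));         // false: group node
        // containsPath(schema, new String[] {"address"}) above would return true.
      }
    }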
@@ -532,7 +616,10 @@ private int loadNextBatch() throws Throwable {
      if (importer != null) importer.close();
      importer = new CometSchemaImporter(ALLOCATOR);

-     List<ColumnDescriptor> columns = requestedSchema.getColumns();
+     for (ParquetColumn childColumn : JavaConverters.seqAsJavaList(parquetColumn.children())) {
+       checkParquetType(childColumn);
+     }
+
      List<Type> fields = requestedSchema.getFields();
      for (int i = 0; i < fields.size(); i++) {
        if (!missingColumns[i]) {