Skip to content

Commit 601ef29

Browse files
committed
Change FieldSummary {upper,lower}_bound to ByteBuf
1 parent aa24cf4 commit 601ef29

File tree

7 files changed

+188
-330
lines changed

7 files changed

+188
-330
lines changed

crates/iceberg/src/expr/visitors/manifest_evaluator.rs

Lines changed: 82 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
// under the License.
1717

1818
use fnv::FnvHashSet;
19-
19+
use serde_bytes::ByteBuf;
2020
use crate::expr::visitors::bound_predicate_visitor::{BoundPredicateVisitor, visit};
2121
use crate::expr::{BoundPredicate, BoundReference};
2222
use crate::spec::{Datum, FieldSummary, ManifestFile, PrimitiveLiteral, Type};
@@ -42,13 +42,13 @@ impl ManifestEvaluator {
4242
/// see if this `ManifestFile` could possibly contain data that matches
4343
/// the scan's filter.
4444
pub(crate) fn eval(&self, manifest_file: &ManifestFile) -> Result<bool> {
45-
if manifest_file.partitions.is_empty() {
46-
return Ok(true);
45+
match &manifest_file.partitions {
46+
Some(p) if !p.is_empty() => {
47+
let mut evaluator = ManifestFilterVisitor::new(p);
48+
visit(&mut evaluator, &self.partition_filter)
49+
}
50+
_ => Ok(true)
4751
}
48-
49-
let mut evaluator = ManifestFilterVisitor::new(&manifest_file.partitions);
50-
51-
visit(&mut evaluator, &self.partition_filter)
5252
}
5353
}
5454

@@ -154,9 +154,16 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
154154
_predicate: &BoundPredicate,
155155
) -> crate::Result<bool> {
156156
let field = self.field_summary_for_reference(reference);
157+
157158
match &field.lower_bound {
158-
Some(bound) if datum <= bound => ROWS_CANNOT_MATCH,
159-
Some(_) => ROWS_MIGHT_MATCH,
159+
Some(bound_bytes) => {
160+
let bound = ManifestFilterVisitor::bytes_to_datum(bound_bytes, reference.field().field_type.clone());
161+
if datum <= &bound {
162+
ROWS_CANNOT_MATCH
163+
} else {
164+
ROWS_MIGHT_MATCH
165+
}
166+
}
160167
None => ROWS_CANNOT_MATCH,
161168
}
162169
}
@@ -169,8 +176,14 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
169176
) -> crate::Result<bool> {
170177
let field = self.field_summary_for_reference(reference);
171178
match &field.lower_bound {
172-
Some(bound) if datum < bound => ROWS_CANNOT_MATCH,
173-
Some(_) => ROWS_MIGHT_MATCH,
179+
Some(bound_bytes) => {
180+
let bound = ManifestFilterVisitor::bytes_to_datum(bound_bytes, reference.field().field_type.clone());
181+
if datum < &bound {
182+
ROWS_CANNOT_MATCH
183+
} else {
184+
ROWS_MIGHT_MATCH
185+
}
186+
}
174187
None => ROWS_CANNOT_MATCH,
175188
}
176189
}
@@ -183,8 +196,14 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
183196
) -> crate::Result<bool> {
184197
let field = self.field_summary_for_reference(reference);
185198
match &field.upper_bound {
186-
Some(bound) if datum >= bound => ROWS_CANNOT_MATCH,
187-
Some(_) => ROWS_MIGHT_MATCH,
199+
Some(bound_bytes) => {
200+
let bound = ManifestFilterVisitor::bytes_to_datum(bound_bytes, reference.field().field_type.clone());
201+
if datum >= &bound {
202+
ROWS_CANNOT_MATCH
203+
} else {
204+
ROWS_MIGHT_MATCH
205+
}
206+
}
188207
None => ROWS_CANNOT_MATCH,
189208
}
190209
}
@@ -197,8 +216,14 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
197216
) -> crate::Result<bool> {
198217
let field = self.field_summary_for_reference(reference);
199218
match &field.upper_bound {
200-
Some(bound) if datum > bound => ROWS_CANNOT_MATCH,
201-
Some(_) => ROWS_MIGHT_MATCH,
219+
Some(bound_bytes) => {
220+
let bound = ManifestFilterVisitor::bytes_to_datum(bound_bytes, reference.field().field_type.clone());
221+
if datum > &bound {
222+
ROWS_CANNOT_MATCH
223+
} else {
224+
ROWS_MIGHT_MATCH
225+
}
226+
}
202227
None => ROWS_CANNOT_MATCH,
203228
}
204229
}
@@ -215,14 +240,16 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
215240
return ROWS_CANNOT_MATCH;
216241
}
217242

218-
if let Some(lower_bound) = &field.lower_bound {
219-
if lower_bound > datum {
243+
if let Some(lower_bound_bytes) = &field.lower_bound {
244+
let lower_bound = ManifestFilterVisitor::bytes_to_datum(lower_bound_bytes, reference.field().field_type.clone());
245+
if datum > &lower_bound {
220246
return ROWS_CANNOT_MATCH;
221247
}
222248
}
223249

224-
if let Some(upper_bound) = &field.upper_bound {
225-
if upper_bound < datum {
250+
if let Some(upper_bound_bytes) = &field.upper_bound {
251+
let upper_bound = ManifestFilterVisitor::bytes_to_datum(upper_bound_bytes, reference.field().field_type.clone());
252+
if datum < &upper_bound {
226253
return ROWS_CANNOT_MATCH;
227254
}
228255
}
@@ -260,23 +287,15 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
260287
let prefix_len = prefix.len();
261288

262289
if let Some(lower_bound) = &field.lower_bound {
263-
let lower_bound_str = ManifestFilterVisitor::datum_as_str(
264-
lower_bound,
265-
"Cannot perform starts_with on non-string lower bound",
266-
)?;
267-
let min_len = lower_bound_str.len().min(prefix_len);
268-
if prefix.as_bytes().lt(&lower_bound_str.as_bytes()[..min_len]) {
290+
let min_len = lower_bound.len().min(prefix_len);
291+
if prefix.as_bytes().lt(&lower_bound[..min_len]) {
269292
return ROWS_CANNOT_MATCH;
270293
}
271294
}
272295

273296
if let Some(upper_bound) = &field.upper_bound {
274-
let upper_bound_str = ManifestFilterVisitor::datum_as_str(
275-
upper_bound,
276-
"Cannot perform starts_with on non-string upper bound",
277-
)?;
278-
let min_len = upper_bound_str.len().min(prefix_len);
279-
if prefix.as_bytes().gt(&upper_bound_str.as_bytes()[..min_len]) {
297+
let min_len = upper_bound.len().min(prefix_len);
298+
if prefix.as_bytes().gt(&upper_bound[..min_len]) {
280299
return ROWS_CANNOT_MATCH;
281300
}
282301
}
@@ -305,34 +324,24 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
305324
// not_starts_with will match unless all values must start with the prefix. This happens when
306325
// the lower and upper bounds both start with the prefix.
307326
if let Some(lower_bound) = &field.lower_bound {
308-
let lower_bound_str = ManifestFilterVisitor::datum_as_str(
309-
lower_bound,
310-
"Cannot perform not_starts_with on non-string lower bound",
311-
)?;
312-
313327
// if lower is shorter than the prefix then lower doesn't start with the prefix
314-
if prefix_len > lower_bound_str.len() {
328+
if prefix_len > lower_bound.len() {
315329
return ROWS_MIGHT_MATCH;
316330
}
317331

318332
if prefix
319333
.as_bytes()
320-
.eq(&lower_bound_str.as_bytes()[..prefix_len])
334+
.eq(&lower_bound[..prefix_len])
321335
{
322336
if let Some(upper_bound) = &field.upper_bound {
323-
let upper_bound_str = ManifestFilterVisitor::datum_as_str(
324-
upper_bound,
325-
"Cannot perform not_starts_with on non-string upper bound",
326-
)?;
327-
328337
// if upper is shorter than the prefix then upper can't start with the prefix
329-
if prefix_len > upper_bound_str.len() {
338+
if prefix_len > upper_bound.len() {
330339
return ROWS_MIGHT_MATCH;
331340
}
332341

333342
if prefix
334343
.as_bytes()
335-
.eq(&upper_bound_str.as_bytes()[..prefix_len])
344+
.eq(&upper_bound[..prefix_len])
336345
{
337346
return ROWS_CANNOT_MATCH;
338347
}
@@ -359,13 +368,19 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
359368
}
360369

361370
if let Some(lower_bound) = &field.lower_bound {
362-
if literals.iter().all(|datum| lower_bound > datum) {
371+
let d = ManifestFilterVisitor::bytes_to_datum(lower_bound, reference.field().clone().field_type);
372+
if literals.iter().all(|datum| {
373+
&d < datum
374+
}) {
363375
return ROWS_CANNOT_MATCH;
364376
}
365377
}
366378

367379
if let Some(upper_bound) = &field.upper_bound {
368-
if literals.iter().all(|datum| upper_bound < datum) {
380+
let d = ManifestFilterVisitor::bytes_to_datum(upper_bound, reference.field().clone().field_type);
381+
if literals.iter().all(|datum| {
382+
&d < datum
383+
}) {
369384
return ROWS_CANNOT_MATCH;
370385
}
371386
}
@@ -414,6 +429,11 @@ impl ManifestFilterVisitor<'_> {
414429
};
415430
Ok(bound)
416431
}
432+
433+
fn bytes_to_datum<'a>(bytes: &ByteBuf, t: Box<Type>) -> Datum {
434+
let p = t.as_primitive_type().unwrap();
435+
Datum::try_from_bytes(bytes, p.clone()).unwrap()
436+
}
417437
}
418438

419439
#[cfg(test)]
@@ -520,8 +540,8 @@ mod test {
520540
FieldSummary {
521541
contains_null: false,
522542
contains_nan: None,
523-
lower_bound: Some(Datum::int(INT_MIN_VALUE)),
524-
upper_bound: Some(Datum::int(INT_MAX_VALUE)),
543+
lower_bound: Some(Datum::int(INT_MIN_VALUE).to_bytes().unwrap()),
544+
upper_bound: Some(Datum::int(INT_MAX_VALUE).to_bytes().unwrap()),
525545
},
526546
// all_nulls_missing_nan
527547
FieldSummary {
@@ -534,22 +554,22 @@ mod test {
534554
FieldSummary {
535555
contains_null: true,
536556
contains_nan: None,
537-
lower_bound: Some(Datum::string(STRING_MIN_VALUE)),
538-
upper_bound: Some(Datum::string(STRING_MAX_VALUE)),
557+
lower_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
558+
upper_bound: Some(Datum::string(STRING_MAX_VALUE).to_bytes().unwrap()),
539559
},
540560
// no_nulls
541561
FieldSummary {
542562
contains_null: false,
543563
contains_nan: None,
544-
lower_bound: Some(Datum::string(STRING_MIN_VALUE)),
545-
upper_bound: Some(Datum::string(STRING_MAX_VALUE)),
564+
lower_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
565+
upper_bound: Some(Datum::string(STRING_MAX_VALUE).to_bytes().unwrap()),
546566
},
547567
// float
548568
FieldSummary {
549569
contains_null: true,
550570
contains_nan: None,
551-
lower_bound: Some(Datum::float(0.0)),
552-
upper_bound: Some(Datum::float(20.0)),
571+
lower_bound: Some(Datum::float(0.0).to_bytes().unwrap()),
572+
upper_bound: Some(Datum::float(20.0).to_bytes().unwrap()),
553573
},
554574
// all_nulls_double
555575
FieldSummary {
@@ -583,8 +603,8 @@ mod test {
583603
FieldSummary {
584604
contains_null: false,
585605
contains_nan: Some(false),
586-
lower_bound: Some(Datum::float(0.0)),
587-
upper_bound: Some(Datum::float(20.0)),
606+
lower_bound: Some(Datum::float(0.0).to_bytes().unwrap()),
607+
upper_bound: Some(Datum::float(20.0).to_bytes().unwrap()),
588608
},
589609
// all_nulls_missing_nan_float
590610
FieldSummary {
@@ -597,15 +617,15 @@ mod test {
597617
FieldSummary {
598618
contains_null: true,
599619
contains_nan: None,
600-
lower_bound: Some(Datum::string(STRING_MIN_VALUE)),
601-
upper_bound: Some(Datum::string(STRING_MIN_VALUE)),
620+
lower_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
621+
upper_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
602622
},
603623
// no_nulls_same_value_a
604624
FieldSummary {
605625
contains_null: false,
606626
contains_nan: None,
607-
lower_bound: Some(Datum::string(STRING_MIN_VALUE)),
608-
upper_bound: Some(Datum::string(STRING_MIN_VALUE)),
627+
lower_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
628+
upper_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
609629
},
610630
]
611631
}
@@ -625,7 +645,7 @@ mod test {
625645
added_rows_count: None,
626646
existing_rows_count: None,
627647
deleted_rows_count: None,
628-
partitions,
648+
partitions: Some(partitions),
629649
key_metadata: vec![],
630650
}
631651
}

crates/iceberg/src/inspect/manifests.rs

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ use futures::{StreamExt, stream};
2929
use crate::Result;
3030
use crate::arrow::schema_to_arrow_schema;
3131
use crate::scan::ArrowRecordBatchStream;
32-
use crate::spec::{FieldSummary, ListType, NestedField, PrimitiveType, StructType, Type};
32+
use crate::spec::{Datum, FieldSummary, ListType, NestedField, PartitionSpecRef, PrimitiveType, Schema, StructType, Type};
3333
use crate::table::Table;
3434

3535
/// Manifests table.
@@ -181,7 +181,7 @@ impl<'a> ManifestsTable<'a> {
181181
.append_value(manifest.existing_files_count.unwrap_or(0) as i32);
182182
deleted_delete_files_count
183183
.append_value(manifest.deleted_files_count.unwrap_or(0) as i32);
184-
self.append_partition_summaries(&mut partition_summaries, &manifest.partitions);
184+
self.append_partition_summaries(&mut partition_summaries, &manifest.partitions.clone().unwrap_or_else(Vec::new), &self.schema(), &self.table.metadata().partition_spec_by_id(manifest.partition_spec_id));
185185
}
186186
}
187187

@@ -230,9 +230,11 @@ impl<'a> ManifestsTable<'a> {
230230
&self,
231231
builder: &mut GenericListBuilder<i32, StructBuilder>,
232232
partitions: &[FieldSummary],
233+
schema: &Schema,
234+
spec: &Option<&PartitionSpecRef>,
233235
) {
234236
let partition_summaries_builder = builder.values();
235-
for summary in partitions {
237+
for (summary, field) in partitions.iter().zip(spec.unwrap().fields()) {
236238
partition_summaries_builder
237239
.field_builder::<BooleanBuilder>(0)
238240
.unwrap()
@@ -241,14 +243,16 @@ impl<'a> ManifestsTable<'a> {
241243
.field_builder::<BooleanBuilder>(1)
242244
.unwrap()
243245
.append_option(summary.contains_nan);
246+
247+
let field_type = schema.field_by_id(field.source_id).unwrap().field_type.as_primitive_type().unwrap();
244248
partition_summaries_builder
245249
.field_builder::<StringBuilder>(2)
246250
.unwrap()
247-
.append_option(summary.lower_bound.as_ref().map(|v| v.to_string()));
251+
.append_option(summary.lower_bound.as_ref().map(|v| Datum::try_from_bytes(v, field_type.clone()).unwrap().to_string()));
248252
partition_summaries_builder
249253
.field_builder::<StringBuilder>(3)
250254
.unwrap()
251-
.append_option(summary.upper_bound.as_ref().map(|v| v.to_string()));
255+
.append_option(summary.upper_bound.as_ref().map(|v| Datum::try_from_bytes(v, field_type.clone()).unwrap().to_string()));
252256
partition_summaries_builder.append(true);
253257
}
254258
builder.append(true);

0 commit comments

Comments
 (0)