Skip to content

Commit 1da092a

Browse files
committed
Change FieldSummary {upper,lower}_bound to ByteBuf
1 parent aa24cf4 commit 1da092a

File tree

7 files changed

+261
-366
lines changed

7 files changed

+261
-366
lines changed

crates/iceberg/src/expr/visitors/manifest_evaluator.rs

Lines changed: 102 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
// under the License.
1717

1818
use fnv::FnvHashSet;
19+
use serde_bytes::ByteBuf;
1920

2021
use crate::expr::visitors::bound_predicate_visitor::{BoundPredicateVisitor, visit};
2122
use crate::expr::{BoundPredicate, BoundReference};
@@ -42,13 +43,13 @@ impl ManifestEvaluator {
4243
/// see if this `ManifestFile` could possibly contain data that matches
4344
/// the scan's filter.
4445
pub(crate) fn eval(&self, manifest_file: &ManifestFile) -> Result<bool> {
45-
if manifest_file.partitions.is_empty() {
46-
return Ok(true);
46+
match &manifest_file.partitions {
47+
Some(p) if !p.is_empty() => {
48+
let mut evaluator = ManifestFilterVisitor::new(p);
49+
visit(&mut evaluator, &self.partition_filter)
50+
}
51+
_ => Ok(true),
4752
}
48-
49-
let mut evaluator = ManifestFilterVisitor::new(&manifest_file.partitions);
50-
51-
visit(&mut evaluator, &self.partition_filter)
5253
}
5354
}
5455

@@ -154,9 +155,19 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
154155
_predicate: &BoundPredicate,
155156
) -> crate::Result<bool> {
156157
let field = self.field_summary_for_reference(reference);
158+
157159
match &field.lower_bound {
158-
Some(bound) if datum <= bound => ROWS_CANNOT_MATCH,
159-
Some(_) => ROWS_MIGHT_MATCH,
160+
Some(bound_bytes) => {
161+
let bound = ManifestFilterVisitor::bytes_to_datum(
162+
bound_bytes,
163+
reference.field().field_type.clone(),
164+
);
165+
if datum <= &bound {
166+
ROWS_CANNOT_MATCH
167+
} else {
168+
ROWS_MIGHT_MATCH
169+
}
170+
}
160171
None => ROWS_CANNOT_MATCH,
161172
}
162173
}
@@ -169,8 +180,17 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
169180
) -> crate::Result<bool> {
170181
let field = self.field_summary_for_reference(reference);
171182
match &field.lower_bound {
172-
Some(bound) if datum < bound => ROWS_CANNOT_MATCH,
173-
Some(_) => ROWS_MIGHT_MATCH,
183+
Some(bound_bytes) => {
184+
let bound = ManifestFilterVisitor::bytes_to_datum(
185+
bound_bytes,
186+
reference.field().field_type.clone(),
187+
);
188+
if datum < &bound {
189+
ROWS_CANNOT_MATCH
190+
} else {
191+
ROWS_MIGHT_MATCH
192+
}
193+
}
174194
None => ROWS_CANNOT_MATCH,
175195
}
176196
}
@@ -183,8 +203,17 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
183203
) -> crate::Result<bool> {
184204
let field = self.field_summary_for_reference(reference);
185205
match &field.upper_bound {
186-
Some(bound) if datum >= bound => ROWS_CANNOT_MATCH,
187-
Some(_) => ROWS_MIGHT_MATCH,
206+
Some(bound_bytes) => {
207+
let bound = ManifestFilterVisitor::bytes_to_datum(
208+
bound_bytes,
209+
reference.field().field_type.clone(),
210+
);
211+
if datum >= &bound {
212+
ROWS_CANNOT_MATCH
213+
} else {
214+
ROWS_MIGHT_MATCH
215+
}
216+
}
188217
None => ROWS_CANNOT_MATCH,
189218
}
190219
}
@@ -197,8 +226,17 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
197226
) -> crate::Result<bool> {
198227
let field = self.field_summary_for_reference(reference);
199228
match &field.upper_bound {
200-
Some(bound) if datum > bound => ROWS_CANNOT_MATCH,
201-
Some(_) => ROWS_MIGHT_MATCH,
229+
Some(bound_bytes) => {
230+
let bound = ManifestFilterVisitor::bytes_to_datum(
231+
bound_bytes,
232+
reference.field().field_type.clone(),
233+
);
234+
if datum > &bound {
235+
ROWS_CANNOT_MATCH
236+
} else {
237+
ROWS_MIGHT_MATCH
238+
}
239+
}
202240
None => ROWS_CANNOT_MATCH,
203241
}
204242
}
@@ -215,14 +253,22 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
215253
return ROWS_CANNOT_MATCH;
216254
}
217255

218-
if let Some(lower_bound) = &field.lower_bound {
219-
if lower_bound > datum {
256+
if let Some(lower_bound_bytes) = &field.lower_bound {
257+
let lower_bound = ManifestFilterVisitor::bytes_to_datum(
258+
lower_bound_bytes,
259+
reference.field().field_type.clone(),
260+
);
261+
if datum > &lower_bound {
220262
return ROWS_CANNOT_MATCH;
221263
}
222264
}
223265

224-
if let Some(upper_bound) = &field.upper_bound {
225-
if upper_bound < datum {
266+
if let Some(upper_bound_bytes) = &field.upper_bound {
267+
let upper_bound = ManifestFilterVisitor::bytes_to_datum(
268+
upper_bound_bytes,
269+
reference.field().field_type.clone(),
270+
);
271+
if datum < &upper_bound {
226272
return ROWS_CANNOT_MATCH;
227273
}
228274
}
@@ -260,23 +306,15 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
260306
let prefix_len = prefix.len();
261307

262308
if let Some(lower_bound) = &field.lower_bound {
263-
let lower_bound_str = ManifestFilterVisitor::datum_as_str(
264-
lower_bound,
265-
"Cannot perform starts_with on non-string lower bound",
266-
)?;
267-
let min_len = lower_bound_str.len().min(prefix_len);
268-
if prefix.as_bytes().lt(&lower_bound_str.as_bytes()[..min_len]) {
309+
let min_len = lower_bound.len().min(prefix_len);
310+
if prefix.as_bytes().lt(&lower_bound[..min_len]) {
269311
return ROWS_CANNOT_MATCH;
270312
}
271313
}
272314

273315
if let Some(upper_bound) = &field.upper_bound {
274-
let upper_bound_str = ManifestFilterVisitor::datum_as_str(
275-
upper_bound,
276-
"Cannot perform starts_with on non-string upper bound",
277-
)?;
278-
let min_len = upper_bound_str.len().min(prefix_len);
279-
if prefix.as_bytes().gt(&upper_bound_str.as_bytes()[..min_len]) {
316+
let min_len = upper_bound.len().min(prefix_len);
317+
if prefix.as_bytes().gt(&upper_bound[..min_len]) {
280318
return ROWS_CANNOT_MATCH;
281319
}
282320
}
@@ -305,35 +343,19 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
305343
// not_starts_with will match unless all values must start with the prefix. This happens when
306344
// the lower and upper bounds both start with the prefix.
307345
if let Some(lower_bound) = &field.lower_bound {
308-
let lower_bound_str = ManifestFilterVisitor::datum_as_str(
309-
lower_bound,
310-
"Cannot perform not_starts_with on non-string lower bound",
311-
)?;
312-
313346
// if lower is shorter than the prefix then lower doesn't start with the prefix
314-
if prefix_len > lower_bound_str.len() {
347+
if prefix_len > lower_bound.len() {
315348
return ROWS_MIGHT_MATCH;
316349
}
317350

318-
if prefix
319-
.as_bytes()
320-
.eq(&lower_bound_str.as_bytes()[..prefix_len])
321-
{
351+
if prefix.as_bytes().eq(&lower_bound[..prefix_len]) {
322352
if let Some(upper_bound) = &field.upper_bound {
323-
let upper_bound_str = ManifestFilterVisitor::datum_as_str(
324-
upper_bound,
325-
"Cannot perform not_starts_with on non-string upper bound",
326-
)?;
327-
328353
// if upper is shorter than the prefix then upper can't start with the prefix
329-
if prefix_len > upper_bound_str.len() {
354+
if prefix_len > upper_bound.len() {
330355
return ROWS_MIGHT_MATCH;
331356
}
332357

333-
if prefix
334-
.as_bytes()
335-
.eq(&upper_bound_str.as_bytes()[..prefix_len])
336-
{
358+
if prefix.as_bytes().eq(&upper_bound[..prefix_len]) {
337359
return ROWS_CANNOT_MATCH;
338360
}
339361
}
@@ -359,13 +381,21 @@ impl BoundPredicateVisitor for ManifestFilterVisitor<'_> {
359381
}
360382

361383
if let Some(lower_bound) = &field.lower_bound {
362-
if literals.iter().all(|datum| lower_bound > datum) {
384+
let d = ManifestFilterVisitor::bytes_to_datum(
385+
lower_bound,
386+
reference.field().clone().field_type,
387+
);
388+
if literals.iter().all(|datum| &d < datum) {
363389
return ROWS_CANNOT_MATCH;
364390
}
365391
}
366392

367393
if let Some(upper_bound) = &field.upper_bound {
368-
if literals.iter().all(|datum| upper_bound < datum) {
394+
let d = ManifestFilterVisitor::bytes_to_datum(
395+
upper_bound,
396+
reference.field().clone().field_type,
397+
);
398+
if literals.iter().all(|datum| &d < datum) {
369399
return ROWS_CANNOT_MATCH;
370400
}
371401
}
@@ -414,6 +444,11 @@ impl ManifestFilterVisitor<'_> {
414444
};
415445
Ok(bound)
416446
}
447+
448+
fn bytes_to_datum<'a>(bytes: &ByteBuf, t: Box<Type>) -> Datum {
449+
let p = t.as_primitive_type().unwrap();
450+
Datum::try_from_bytes(bytes, p.clone()).unwrap()
451+
}
417452
}
418453

419454
#[cfg(test)]
@@ -520,8 +555,8 @@ mod test {
520555
FieldSummary {
521556
contains_null: false,
522557
contains_nan: None,
523-
lower_bound: Some(Datum::int(INT_MIN_VALUE)),
524-
upper_bound: Some(Datum::int(INT_MAX_VALUE)),
558+
lower_bound: Some(Datum::int(INT_MIN_VALUE).to_bytes().unwrap()),
559+
upper_bound: Some(Datum::int(INT_MAX_VALUE).to_bytes().unwrap()),
525560
},
526561
// all_nulls_missing_nan
527562
FieldSummary {
@@ -534,22 +569,22 @@ mod test {
534569
FieldSummary {
535570
contains_null: true,
536571
contains_nan: None,
537-
lower_bound: Some(Datum::string(STRING_MIN_VALUE)),
538-
upper_bound: Some(Datum::string(STRING_MAX_VALUE)),
572+
lower_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
573+
upper_bound: Some(Datum::string(STRING_MAX_VALUE).to_bytes().unwrap()),
539574
},
540575
// no_nulls
541576
FieldSummary {
542577
contains_null: false,
543578
contains_nan: None,
544-
lower_bound: Some(Datum::string(STRING_MIN_VALUE)),
545-
upper_bound: Some(Datum::string(STRING_MAX_VALUE)),
579+
lower_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
580+
upper_bound: Some(Datum::string(STRING_MAX_VALUE).to_bytes().unwrap()),
546581
},
547582
// float
548583
FieldSummary {
549584
contains_null: true,
550585
contains_nan: None,
551-
lower_bound: Some(Datum::float(0.0)),
552-
upper_bound: Some(Datum::float(20.0)),
586+
lower_bound: Some(Datum::float(0.0).to_bytes().unwrap()),
587+
upper_bound: Some(Datum::float(20.0).to_bytes().unwrap()),
553588
},
554589
// all_nulls_double
555590
FieldSummary {
@@ -583,8 +618,8 @@ mod test {
583618
FieldSummary {
584619
contains_null: false,
585620
contains_nan: Some(false),
586-
lower_bound: Some(Datum::float(0.0)),
587-
upper_bound: Some(Datum::float(20.0)),
621+
lower_bound: Some(Datum::float(0.0).to_bytes().unwrap()),
622+
upper_bound: Some(Datum::float(20.0).to_bytes().unwrap()),
588623
},
589624
// all_nulls_missing_nan_float
590625
FieldSummary {
@@ -597,15 +632,15 @@ mod test {
597632
FieldSummary {
598633
contains_null: true,
599634
contains_nan: None,
600-
lower_bound: Some(Datum::string(STRING_MIN_VALUE)),
601-
upper_bound: Some(Datum::string(STRING_MIN_VALUE)),
635+
lower_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
636+
upper_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
602637
},
603638
// no_nulls_same_value_a
604639
FieldSummary {
605640
contains_null: false,
606641
contains_nan: None,
607-
lower_bound: Some(Datum::string(STRING_MIN_VALUE)),
608-
upper_bound: Some(Datum::string(STRING_MIN_VALUE)),
642+
lower_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
643+
upper_bound: Some(Datum::string(STRING_MIN_VALUE).to_bytes().unwrap()),
609644
},
610645
]
611646
}
@@ -625,7 +660,7 @@ mod test {
625660
added_rows_count: None,
626661
existing_rows_count: None,
627662
deleted_rows_count: None,
628-
partitions,
663+
partitions: Some(partitions),
629664
key_metadata: vec![],
630665
}
631666
}

crates/iceberg/src/inspect/manifests.rs

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@ use futures::{StreamExt, stream};
2929
use crate::Result;
3030
use crate::arrow::schema_to_arrow_schema;
3131
use crate::scan::ArrowRecordBatchStream;
32-
use crate::spec::{FieldSummary, ListType, NestedField, PrimitiveType, StructType, Type};
32+
use crate::spec::{
33+
Datum, FieldSummary, ListType, NestedField, PartitionSpecRef, PrimitiveType, Schema,
34+
StructType, Type,
35+
};
3336
use crate::table::Table;
3437

3538
/// Manifests table.
@@ -181,7 +184,15 @@ impl<'a> ManifestsTable<'a> {
181184
.append_value(manifest.existing_files_count.unwrap_or(0) as i32);
182185
deleted_delete_files_count
183186
.append_value(manifest.deleted_files_count.unwrap_or(0) as i32);
184-
self.append_partition_summaries(&mut partition_summaries, &manifest.partitions);
187+
self.append_partition_summaries(
188+
&mut partition_summaries,
189+
&manifest.partitions.clone().unwrap_or_else(Vec::new),
190+
&self.schema(),
191+
&self
192+
.table
193+
.metadata()
194+
.partition_spec_by_id(manifest.partition_spec_id),
195+
);
185196
}
186197
}
187198

@@ -230,9 +241,11 @@ impl<'a> ManifestsTable<'a> {
230241
&self,
231242
builder: &mut GenericListBuilder<i32, StructBuilder>,
232243
partitions: &[FieldSummary],
244+
schema: &Schema,
245+
spec: &Option<&PartitionSpecRef>,
233246
) {
234247
let partition_summaries_builder = builder.values();
235-
for summary in partitions {
248+
for (summary, field) in partitions.iter().zip(spec.unwrap().fields()) {
236249
partition_summaries_builder
237250
.field_builder::<BooleanBuilder>(0)
238251
.unwrap()
@@ -241,14 +254,29 @@ impl<'a> ManifestsTable<'a> {
241254
.field_builder::<BooleanBuilder>(1)
242255
.unwrap()
243256
.append_option(summary.contains_nan);
257+
258+
let field_type = schema
259+
.field_by_id(field.source_id)
260+
.unwrap()
261+
.field_type
262+
.as_primitive_type()
263+
.unwrap();
244264
partition_summaries_builder
245265
.field_builder::<StringBuilder>(2)
246266
.unwrap()
247-
.append_option(summary.lower_bound.as_ref().map(|v| v.to_string()));
267+
.append_option(summary.lower_bound.as_ref().map(|v| {
268+
Datum::try_from_bytes(v, field_type.clone())
269+
.unwrap()
270+
.to_string()
271+
}));
248272
partition_summaries_builder
249273
.field_builder::<StringBuilder>(3)
250274
.unwrap()
251-
.append_option(summary.upper_bound.as_ref().map(|v| v.to_string()));
275+
.append_option(summary.upper_bound.as_ref().map(|v| {
276+
Datum::try_from_bytes(v, field_type.clone())
277+
.unwrap()
278+
.to_string()
279+
}));
252280
partition_summaries_builder.append(true);
253281
}
254282
builder.append(true);

0 commit comments

Comments
 (0)