Minor: Add a parquet row_filter test, reduce some test boilerplate #7522

Merged
merged 5 commits on May 20, 2025
Changes from all commits
170 changes: 95 additions & 75 deletions parquet/src/arrow/async_reader/mod.rs
@@ -1124,6 +1124,16 @@ mod tests {
requests: Arc<Mutex<Vec<Range<usize>>>>,
}

impl TestReader {
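/// Wraps `data` in a reader whose cached metadata and request log start
/// empty; the tests below use this to avoid repeating the struct literal.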
fn new(data: Bytes) -> Self {
Self {
data,
metadata: Default::default(),
requests: Default::default(),
}
}
}

impl AsyncFileReader for TestReader {
fn get_bytes(&mut self, range: Range<u64>) -> BoxFuture<'_, Result<Bytes>> {
let range = range.clone();
@@ -1156,11 +1166,7 @@ mod tests {
let path = format!("{testdata}/alltypes_plain.parquet");
let data = Bytes::from(std::fs::read(path).unwrap());

let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());

let requests = async_reader.requests.clone();
let builder = ParquetRecordBatchStreamBuilder::new(async_reader)
@@ -1209,11 +1215,7 @@ mod tests {
let path = format!("{testdata}/alltypes_plain.parquet");
let data = Bytes::from(std::fs::read(path).unwrap());

let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());

let requests = async_reader.requests.clone();
let builder = ParquetRecordBatchStreamBuilder::new(async_reader)
@@ -1270,11 +1272,7 @@ mod tests {
let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet");
let data = Bytes::from(std::fs::read(path).unwrap());

let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());

let options = ArrowReaderOptions::new().with_page_index(true);
let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
@@ -1339,11 +1337,7 @@ mod tests {

assert_eq!(metadata.num_row_groups(), 1);

let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());

let builder = ParquetRecordBatchStreamBuilder::new(async_reader)
.await
@@ -1380,11 +1374,7 @@ mod tests {
let path = format!("{testdata}/alltypes_tiny_pages_plain.parquet");
let data = Bytes::from(std::fs::read(path).unwrap());

let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());

let options = ArrowReaderOptions::new().with_page_index(true);
let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
@@ -1458,11 +1448,7 @@ mod tests {

let selection = RowSelection::from(selectors);

let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());

let options = ArrowReaderOptions::new().with_page_index(true);
let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
@@ -1524,11 +1510,7 @@ mod tests {

let selection = RowSelection::from(selectors);

let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());

let options = ArrowReaderOptions::new().with_page_index(true);
let builder = ParquetRecordBatchStreamBuilder::new_with_options(async_reader, options)
@@ -1555,6 +1537,70 @@ mod tests {

#[tokio::test]
async fn test_row_filter() {
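// End-to-end check: a single RowFilter prunes rows, and the reader
// issues one fetch for the predicate columns and one for the projection.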
let a = StringArray::from_iter_values(["a", "b", "b", "b", "c", "c"]);
let b = StringArray::from_iter_values(["1", "2", "3", "4", "5", "6"]);
let data = RecordBatch::try_from_iter([
("a", Arc::new(a) as ArrayRef),
("b", Arc::new(b) as ArrayRef),
])
.unwrap();

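// Write the two-column batch to an in-memory Parquet buffer.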
let mut buf = Vec::with_capacity(1024);
let mut writer = ArrowWriter::try_new(&mut buf, data.schema(), None).unwrap();
writer.write(&data).unwrap();
writer.close().unwrap();

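// Re-read the footer so the test can build ProjectionMasks from the
// file's leaf columns.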
let data: Bytes = buf.into();
let metadata = ParquetMetaDataReader::new()
.parse_and_finish(&data)
.unwrap();
let parquet_schema = metadata.file_metadata().schema_descr_ptr();

let test = TestReader::new(data);
let requests = test.requests.clone();

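// Predicate on leaf column 0 ("a"): keep only rows equal to "b".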
let a_scalar = StringArray::from_iter_values(["b"]);
let a_filter = ArrowPredicateFn::new(
ProjectionMask::leaves(&parquet_schema, vec![0]),
move |batch| eq(batch.column(0), &Scalar::new(&a_scalar)),
);

let filter = RowFilter::new(vec![Box::new(a_filter)]);

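// Read both columns back with the filter applied.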
let mask = ProjectionMask::leaves(&parquet_schema, vec![0, 1]);
let stream = ParquetRecordBatchStreamBuilder::new(test)
.await
.unwrap()
.with_projection(mask.clone())
.with_batch_size(1024)
.with_row_filter(filter)
.build()
.unwrap();

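// Six input rows with batch_size 1024, so all output fits in one batch.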
let batches: Vec<_> = stream.try_collect().await.unwrap();
assert_eq!(batches.len(), 1);

let batch = &batches[0];
assert_eq!(batch.num_columns(), 2);

// Filter should have kept only rows with "b" in column 0
assert_eq!(
batch.column(0).as_ref(),
&StringArray::from_iter_values(["b", "b", "b"])
);
assert_eq!(
batch.column(1).as_ref(),
&StringArray::from_iter_values(["2", "3", "4"])
);

// Should only have made 2 requests:
// * First request fetches data for evaluating the predicate
// * Second request fetches data for evaluating the projection
assert_eq!(requests.lock().unwrap().len(), 2);
}

#[tokio::test]
async fn test_two_row_filters() {
let a = StringArray::from_iter_values(["a", "b", "b", "b", "c", "c"]);
let b = StringArray::from_iter_values(["1", "2", "3", "4", "5", "6"]);
let c = Int32Array::from_iter(0..6);
@@ -1576,11 +1622,7 @@
.unwrap();
let parquet_schema = metadata.file_metadata().schema_descr_ptr();

let test = TestReader {
data,
metadata: Default::default(),
requests: Default::default(),
};
let test = TestReader::new(data);
let requests = test.requests.clone();

let a_scalar = StringArray::from_iter_values(["b"]);
@@ -1623,6 +1665,9 @@ mod tests {
assert_eq!(val, 3);

// Should only have made 3 requests
// * First request fetches data for evaluating the first predicate
// * Second request fetches data for evaluating the second predicate
// * Third request fetches data for evaluating the projection
assert_eq!(requests.lock().unwrap().len(), 3);
}

@@ -1653,11 +1698,7 @@ mod tests {

assert_eq!(metadata.num_row_groups(), 2);

let test = TestReader {
data,
metadata: Default::default(),
requests: Default::default(),
};
let test = TestReader::new(data);

let stream = ParquetRecordBatchStreamBuilder::new(test.clone())
.await
@@ -1744,11 +1785,7 @@ mod tests {

assert_eq!(metadata.num_row_groups(), 1);

let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());

let a_filter =
ArrowPredicateFn::new(ProjectionMask::leaves(&parquet_schema, vec![1]), |batch| {
@@ -1812,11 +1849,7 @@ mod tests {

assert_eq!(metadata.num_row_groups(), 1);

let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());

let requests = async_reader.requests.clone();
let (_, fields) = parquet_to_arrow_schema_and_fields(
@@ -1882,11 +1915,7 @@ mod tests {
let path = format!("{testdata}/alltypes_plain.parquet");
let data = Bytes::from(std::fs::read(path).unwrap());

let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());

let builder = ParquetRecordBatchStreamBuilder::new(async_reader)
.await
@@ -2025,11 +2054,7 @@ mod tests {
let testdata = arrow::util::test_util::parquet_test_data();
let path = format!("{testdata}/data_index_bloom_encoding_stats.parquet");
let data = Bytes::from(std::fs::read(path).unwrap());
let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());
let builder = ParquetRecordBatchStreamBuilder::new(async_reader)
.await
.unwrap();
@@ -2052,11 +2077,7 @@ mod tests {
}

async fn test_get_row_group_column_bloom_filter(data: Bytes, with_length: bool) {
let async_reader = TestReader {
data: data.clone(),
metadata: Default::default(),
requests: Default::default(),
};
let async_reader = TestReader::new(data.clone());

let mut builder = ParquetRecordBatchStreamBuilder::new(async_reader)
.await
@@ -2195,11 +2216,7 @@ mod tests {
.unwrap();
let parquet_schema = metadata.file_metadata().schema_descr_ptr();

let test = TestReader {
data,
metadata: Default::default(),
requests: Default::default(),
};
let test = TestReader::new(data);
let requests = test.requests.clone();

let a_scalar = StringArray::from_iter_values(["b"]);
@@ -2250,6 +2267,9 @@ mod tests {
assert_eq!(val, 3);

// Should only have made 3 requests
// * First request fetches data for evaluating the first predicate
// * Second request fetches data for evaluating the second predicate
// * Third request fetches data for evaluating the projection
assert_eq!(requests.lock().unwrap().len(), 3);
}
