Add support for file row numbers in Parquet readers #7307

Open · wants to merge 9 commits into main
4 changes: 4 additions & 0 deletions parquet/examples/read_with_rowgroup.rs
@@ -129,6 +129,10 @@ impl RowGroups for InMemoryRowGroup {
}
}
}

fn row_groups(&self) -> Box<dyn Iterator<Item = &RowGroupMetaData> + '_> {
Box::new(std::iter::once(&self.metadata))
}
}

impl InMemoryRowGroup {
84 changes: 74 additions & 10 deletions parquet/src/arrow/array_reader/builder.rs
@@ -15,13 +15,13 @@
// specific language governing permissions and limitations
// under the License.

use arrow_schema::{DataType, Field, Fields, SchemaBuilder};
use std::sync::Arc;

use arrow_schema::{DataType, Fields, SchemaBuilder};

use crate::arrow::array_reader::byte_view_array::make_byte_view_array_reader;
use crate::arrow::array_reader::empty_array::make_empty_array_reader;
use crate::arrow::array_reader::fixed_len_byte_array::make_fixed_len_byte_array_reader;
use crate::arrow::array_reader::row_number::RowNumberReader;
use crate::arrow::array_reader::{
make_byte_array_dictionary_reader, make_byte_array_reader, ArrayReader,
FixedSizeListArrayReader, ListArrayReader, MapArrayReader, NullArrayReader,
@@ -39,9 +39,27 @@ pub fn build_array_reader(
field: Option<&ParquetField>,
mask: &ProjectionMask,
row_groups: &dyn RowGroups,
row_number_column: Option<&str>,
) -> Result<Box<dyn ArrayReader>> {
let reader = field
.and_then(|field| build_reader(field, mask, row_groups).transpose())
.and_then(|field| build_reader(field, mask, row_groups, row_number_column).transpose())
.or_else(|| {
row_number_column.map(|column| {
let row_number_reader = build_row_number_reader(row_groups)?;
let reader: Box<dyn ArrayReader> = Box::new(StructArrayReader::new(
DataType::Struct(Fields::from(vec![Field::new(
column,
row_number_reader.get_data_type().clone(),
false,
)])),
vec![row_number_reader],
0,
0,
false,
));
Ok(reader)
})
})
.transpose()?
.unwrap_or_else(|| make_empty_array_reader(row_groups.num_rows()));

@@ -52,12 +70,13 @@ fn build_reader(
field: &ParquetField,
mask: &ProjectionMask,
row_groups: &dyn RowGroups,
row_number_column: Option<&str>,
Contributor commented:
Maybe a crazy idea, but wouldn't the implementation be simpler (and more flexible) with a RowNumber extension type? Then users could do e.g.

Field::new("row_index", DataType::Int64, false).with_extension_type(RowNumber)

and build_primitive_reader could just check for it, no matter where in the schema it hides, instead of implicitly adding an extra column to the schema?

Contributor commented:
Update: I don't think raw parquet types support metadata, so this may not be an option.

Author commented:
This would simplify usage of the feature. Having to keep track of the additional row number column is quite cumbersome in clients of this API. One option could be to extend ParquetFieldType with an additional row number type and add it based on the extension type in ArrowReaderMetadata::with_supplied_metadata. @etseidl @alamb, what do you think about this approach?
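
A rough sketch of the idea discussed above, built on arrow-schema's ExtensionType trait. This is not part of the PR: the extension name "parquet.row_number", the Int64-only constraint, and the usage in main are illustrative assumptions.

use arrow_schema::extension::ExtensionType;
use arrow_schema::{ArrowError, DataType, Field};

/// Hypothetical marker extension type tagging a field as a row-number column.
struct RowNumber;

impl ExtensionType for RowNumber {
    // Assumed name; no canonical extension name is implied.
    const NAME: &'static str = "parquet.row_number";
    // The marker carries no metadata of its own.
    type Metadata = ();

    fn metadata(&self) -> &Self::Metadata {
        &()
    }

    fn serialize_metadata(&self) -> Option<String> {
        None
    }

    fn deserialize_metadata(_metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
        Ok(())
    }

    fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> {
        match data_type {
            DataType::Int64 => Ok(()),
            other => Err(ArrowError::InvalidArgumentError(format!(
                "RowNumber only supports Int64, found {other}"
            ))),
        }
    }

    fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
        RowNumber.supports_data_type(data_type)?;
        Ok(RowNumber)
    }
}

fn main() {
    // A reader could detect the marker anywhere in the schema, rather than
    // relying on an implicitly appended column.
    let field = Field::new("row_index", DataType::Int64, false).with_extension_type(RowNumber);
    assert_eq!(field.extension_type_name(), Some(RowNumber::NAME));
}
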

) -> Result<Option<Box<dyn ArrayReader>>> {
match field.field_type {
ParquetFieldType::Primitive { .. } => build_primitive_reader(field, mask, row_groups),
ParquetFieldType::Group { .. } => match &field.arrow_type {
DataType::Map(_, _) => build_map_reader(field, mask, row_groups),
DataType::Struct(_) => build_struct_reader(field, mask, row_groups),
DataType::Struct(_) => build_struct_reader(field, mask, row_groups, row_number_column),
DataType::List(_) => build_list_reader(field, mask, false, row_groups),
DataType::LargeList(_) => build_list_reader(field, mask, true, row_groups),
DataType::FixedSizeList(_, _) => build_fixed_size_list_reader(field, mask, row_groups),
@@ -66,6 +85,10 @@
}
}

fn build_row_number_reader(row_groups: &dyn RowGroups) -> Result<Box<dyn ArrayReader>> {
Ok(Box::new(RowNumberReader::try_new(row_groups.row_groups())?))
}

/// Build array reader for map type.
fn build_map_reader(
field: &ParquetField,
@@ -75,8 +98,8 @@ fn build_map_reader(
let children = field.children().unwrap();
assert_eq!(children.len(), 2);

let key_reader = build_reader(&children[0], mask, row_groups)?;
let value_reader = build_reader(&children[1], mask, row_groups)?;
let key_reader = build_reader(&children[0], mask, row_groups, None)?;
let value_reader = build_reader(&children[1], mask, row_groups, None)?;

match (key_reader, value_reader) {
(Some(key_reader), Some(value_reader)) => {
@@ -127,7 +150,7 @@ fn build_list_reader(
let children = field.children().unwrap();
assert_eq!(children.len(), 1);

let reader = match build_reader(&children[0], mask, row_groups)? {
let reader = match build_reader(&children[0], mask, row_groups, None)? {
Some(item_reader) => {
// Need to retrieve underlying data type to handle projection
let item_type = item_reader.get_data_type().clone();
@@ -173,7 +196,7 @@ fn build_fixed_size_list_reader(
let children = field.children().unwrap();
assert_eq!(children.len(), 1);

let reader = match build_reader(&children[0], mask, row_groups)? {
let reader = match build_reader(&children[0], mask, row_groups, None)? {
Some(item_reader) => {
let item_type = item_reader.get_data_type().clone();
let reader = match &field.arrow_type {
@@ -300,6 +323,7 @@ fn build_struct_reader(
field: &ParquetField,
mask: &ProjectionMask,
row_groups: &dyn RowGroups,
row_number_column: Option<&str>,
) -> Result<Option<Box<dyn ArrayReader>>> {
let arrow_fields = match &field.arrow_type {
DataType::Struct(children) => children,
@@ -312,14 +336,24 @@
let mut builder = SchemaBuilder::with_capacity(children.len());

for (arrow, parquet) in arrow_fields.iter().zip(children) {
if let Some(reader) = build_reader(parquet, mask, row_groups)? {
if let Some(reader) = build_reader(parquet, mask, row_groups, None)? {
// Need to retrieve underlying data type to handle projection
let child_type = reader.get_data_type().clone();
builder.push(arrow.as_ref().clone().with_data_type(child_type));
readers.push(reader);
}
}

if let Some(row_number_column) = row_number_column {
let reader = build_row_number_reader(row_groups)?;
builder.push(Field::new(
row_number_column,
reader.get_data_type().clone(),
false,
));
readers.push(reader);
}

if readers.is_empty() {
return Ok(None);
}
@@ -356,7 +390,7 @@ mod tests {
)
.unwrap();

let array_reader = build_array_reader(fields.as_ref(), &mask, &file_reader).unwrap();
let array_reader = build_array_reader(fields.as_ref(), &mask, &file_reader, None).unwrap();

// Create arrow types
let arrow_type = DataType::Struct(Fields::from(vec![Field::new(
@@ -367,4 +401,34 @@

assert_eq!(array_reader.get_data_type(), &arrow_type);
}

#[test]
fn test_create_array_reader_with_row_numbers() {
let file = get_test_file("nulls.snappy.parquet");
let file_reader: Arc<dyn FileReader> = Arc::new(SerializedFileReader::new(file).unwrap());

let file_metadata = file_reader.metadata().file_metadata();
let mask = ProjectionMask::leaves(file_metadata.schema_descr(), [0]);
let (_, fields) = parquet_to_arrow_schema_and_fields(
file_metadata.schema_descr(),
ProjectionMask::all(),
file_metadata.key_value_metadata(),
)
.unwrap();

let array_reader =
build_array_reader(fields.as_ref(), &mask, &file_reader, Some("row_number")).unwrap();

// Create arrow types
let arrow_type = DataType::Struct(Fields::from(vec![
Field::new(
"b_struct",
DataType::Struct(vec![Field::new("b_c_int", DataType::Int32, true)].into()),
true,
),
Field::new("row_number", DataType::Int64, false),
]));

assert_eq!(array_reader.get_data_type(), &arrow_type);
}
}
3 changes: 2 additions & 1 deletion parquet/src/arrow/array_reader/list_array.rs
@@ -563,7 +563,8 @@ mod tests {
)
.unwrap();

let mut array_reader = build_array_reader(fields.as_ref(), &mask, &file_reader).unwrap();
let mut array_reader =
build_array_reader(fields.as_ref(), &mask, &file_reader, None).unwrap();

let batch = array_reader.next_batch(100).unwrap();
assert_eq!(batch.data_type(), array_reader.get_data_type());
9 changes: 9 additions & 0 deletions parquet/src/arrow/array_reader/mod.rs
@@ -40,11 +40,13 @@ mod list_array;
mod map_array;
mod null_array;
mod primitive_array;
mod row_number;
mod struct_array;

#[cfg(test)]
mod test_util;

use crate::file::metadata::RowGroupMetaData;
pub use builder::build_array_reader;
pub use byte_array::make_byte_array_reader;
pub use byte_array_dictionary::make_byte_array_dictionary_reader;
@@ -113,6 +115,9 @@ pub trait RowGroups {

/// Returns a [`PageIterator`] for the column chunks with the given leaf column index
fn column_chunks(&self, i: usize) -> Result<Box<dyn PageIterator>>;

/// Returns an iterator over the row groups in this collection
fn row_groups(&self) -> Box<dyn Iterator<Item = &RowGroupMetaData> + '_>;
}

impl RowGroups for Arc<dyn FileReader> {
@@ -124,6 +129,10 @@ impl RowGroups for Arc<dyn FileReader> {
let iterator = FilePageIterator::new(column_index, Arc::clone(self))?;
Ok(Box::new(iterator))
}

fn row_groups(&self) -> Box<dyn Iterator<Item = &RowGroupMetaData> + '_> {
Box::new(self.metadata().row_groups().iter())
}
}

/// Uses `record_reader` to read up to `batch_size` records from `pages`
84 changes: 84 additions & 0 deletions parquet/src/arrow/array_reader/row_number.rs
@@ -0,0 +1,84 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use crate::arrow::array_reader::ArrayReader;
use crate::errors::{ParquetError, Result};
use crate::file::metadata::RowGroupMetaData;
use arrow_array::{ArrayRef, Int64Array};
use arrow_schema::DataType;
use std::any::Any;
use std::sync::Arc;

/// An [`ArrayReader`] that yields the file-relative row number of each row,
/// derived from the starting row index and row count of every row group read.
pub(crate) struct RowNumberReader {
buffered_row_numbers: Vec<i64>,
remaining_row_numbers: std::iter::Flatten<std::vec::IntoIter<std::ops::Range<i64>>>,
}

impl RowNumberReader {
pub(crate) fn try_new<'a>(
row_groups: impl Iterator<Item = &'a RowGroupMetaData>,
) -> Result<Self> {
let ranges = row_groups
.map(|rg| {
let first_row_number = rg.first_row_index().ok_or(ParquetError::General(
"Row group missing row number".to_string(),
))?;
Ok(first_row_number..first_row_number + rg.num_rows())
})
.collect::<Result<Vec<_>>>()?;
Ok(Self {
buffered_row_numbers: Vec::new(),
remaining_row_numbers: ranges.into_iter().flatten(),
})
}
}

impl ArrayReader for RowNumberReader {
fn read_records(&mut self, batch_size: usize) -> Result<usize> {
let starting_len = self.buffered_row_numbers.len();
self.buffered_row_numbers
.extend((&mut self.remaining_row_numbers).take(batch_size));
Ok(self.buffered_row_numbers.len() - starting_len)
}

fn skip_records(&mut self, num_records: usize) -> Result<usize> {
// TODO: Use advance_by when it stabilizes to improve performance
Ok((&mut self.remaining_row_numbers).take(num_records).count())
}

fn as_any(&self) -> &dyn Any {
self
}

fn get_data_type(&self) -> &DataType {
&DataType::Int64
}

fn consume_batch(&mut self) -> Result<ArrayRef> {
Ok(Arc::new(Int64Array::from_iter(
self.buffered_row_numbers.drain(..),
)))
}

fn get_def_levels(&self) -> Option<&[i16]> {
None
}

fn get_rep_levels(&self) -> Option<&[i16]> {
None
}
}
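
For intuition, the numbering logic above amounts to flattening one half-open range per row group, first_row_index..first_row_index + num_rows. A minimal standalone sketch of that behavior, with (first_row_index, num_rows) pairs standing in for RowGroupMetaData:

// Sketch: each row group contributes the range first..first + num_rows;
// RowNumberReader hands the flattened sequence out in batches.
fn row_numbers(groups: &[(i64, i64)]) -> Vec<i64> {
    groups
        .iter()
        .flat_map(|&(first, num_rows)| first..first + num_rows)
        .collect()
}

fn main() {
    // Two row groups: 3 rows starting at file row 0, then 2 rows starting at 3.
    let all = row_numbers(&[(0, 3), (3, 2)]);
    assert_eq!(all, vec![0, 1, 2, 3, 4]);
    // skip_records(1) followed by read_records would yield the tail.
    assert_eq!(&all[1..], &[1, 2, 3, 4]);
}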