Skip to content

Commit 85779a4

Browse files
adamreevemapleFU
andauthored
GH-45185: [C++][Parquet] Raise an error for invalid repetition levels when delimiting records (#45186)
### Rationale for this change See #45185. Invalid repetition levels would previously only cause a fatal error in debug builds. ### What changes are included in this PR? Converts an existing `ARROW_DCHECK_EQ` of the repetition level with a check that will raise an exception in release builds too. ### Are these changes tested? Yes, using a new example file (apache/parquet-testing#67) ### Are there any user-facing changes? Yes, reading columns with invalid repetition levels as Arrow arrays will now raise an exception. * GitHub Issue: #45185 Lead-authored-by: Adam Reeve <[email protected]> Co-authored-by: mwish <[email protected]> Signed-off-by: mwish <[email protected]>
1 parent cc10aa5 commit 85779a4

File tree

3 files changed

+22
-3
lines changed

3 files changed

+22
-3
lines changed

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

+15-1
Original file line numberDiff line numberDiff line change
@@ -3978,7 +3978,8 @@ TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) {
39783978
}
39793979

39803980
void TryReadDataFile(const std::string& path,
3981-
::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) {
3981+
::arrow::StatusCode expected_code = ::arrow::StatusCode::OK,
3982+
const std::string& expected_message = "") {
39823983
auto pool = ::arrow::default_memory_pool();
39833984

39843985
std::unique_ptr<FileReader> arrow_reader;
@@ -3992,6 +3993,12 @@ void TryReadDataFile(const std::string& path,
39923993
ASSERT_EQ(s.code(), expected_code)
39933994
<< "Expected reading file to return " << arrow::Status::CodeAsString(expected_code)
39943995
<< ", but got " << s.ToString();
3996+
3997+
if (!expected_message.empty()) {
3998+
ASSERT_EQ(s.message().find(expected_message), 0)
3999+
<< "Expected an error message beginning with '" << expected_message
4000+
<< "', but got '" << s.message() << "'";
4001+
}
39954002
}
39964003

39974004
TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) {
@@ -4005,6 +4012,13 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
40054012
TryReadDataFile(path, ::arrow::StatusCode::IOError);
40064013
}
40074014

4015+
TEST(TestArrowReaderAdHoc, InvalidRepetitionLevels) {
4016+
// GH-45185 - Repetition levels start with 1 instead of 0
4017+
auto path = test::get_data_file("ARROW-GH-45185.parquet", /*is_good=*/false);
4018+
TryReadDataFile(path, ::arrow::StatusCode::IOError,
4019+
"The repetition level at the start of a record must be 0 but got 1");
4020+
}
4021+
40084022
TEST(TestArrowReaderAdHoc, LARGE_MEMORY_TEST(LargeStringColumn)) {
40094023
// ARROW-3762
40104024
::arrow::StringBuilder builder;

cpp/src/parquet/column_reader.cc

+6-1
Original file line numberDiff line numberDiff line change
@@ -1611,7 +1611,12 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
16111611
// another record start or exhausting the ColumnChunk
16121612
int64_t level = levels_position_;
16131613
if (at_record_start_) {
1614-
ARROW_DCHECK_EQ(0, rep_levels[levels_position_]);
1614+
if (ARROW_PREDICT_FALSE(rep_levels[levels_position_] != 0)) {
1615+
std::stringstream ss;
1616+
ss << "The repetition level at the start of a record must be 0 but got "
1617+
<< rep_levels[levels_position_];
1618+
throw ParquetException(ss.str());
1619+
}
16151620
++levels_position_;
16161621
// We have decided to consume the level at this position; therefore we
16171622
// must advance until we find another record boundary

0 commit comments

Comments
 (0)