Skip to content

Commit a8bfeef

Browse files
corregoerchiragCarlos Orrego
authored
Dictionary nulls multi page fix (tensorflow#58)
* Optimize page reading by not creating an array from the bytes owner * - Fixes misaligned data from dictionary-encoded columns with null values (tensorflow#57) - Added test and test data file Co-authored-by: Chirag Gupta (AZURE) <[email protected]> Co-authored-by: Carlos Orrego <[email protected]>
1 parent 7c311a9 commit a8bfeef

File tree

3 files changed

+29
-1
lines changed

3 files changed

+29
-1
lines changed

src/Parquet.Test/ParquetReaderTest.cs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,30 @@ public void Read_multiple_data_pages()
144144
Assert.Equal(0.867111980015206, seq.Max(p => p.v), 5);
145145
}
146146
}
147+
148+
/// <summary>
/// Reads the test file "/special/multi_page_dictionary_with_nulls.parquet" and checks that the
/// string values of the first column match the expected values at the boundary of the first data
/// page and near the end of the file, including one entry that is expected to be null.
/// </summary>
[Fact]
149+
public void Read_multi_page_dictionary_with_nulls()
150+
{
151+
using (var reader = new ParquetReader(OpenTestFile("/special/multi_page_dictionary_with_nulls.parquet")))
152+
{
153+
DataColumn[] columns = reader.ReadEntireRowGroup();
154+
// NOTE(review): 'rg' is never used below and is not disposed in this test — confirm whether it can be removed.
var rg = reader.OpenRowGroupReader(0);
155+
156+
// Read the first column's values as a string array.
157+
var data = (string[]) columns[0].Data;
158+
159+
// Expected values are ground truth taken from Spark.
160+
// Check the page boundary (the first page contains 107432 rows, so index 107432 is the first row after it).
161+
Assert.Equal("xc3w4eudww", data[107432]);
162+
Assert.Equal("bpywp4wtwk", data[107433]);
163+
Assert.Equal("z6x8652rle", data[107434]);
164+
165+
// Check values near the end of the file, including a null entry.
166+
Assert.Null(data[310028]);
167+
Assert.Equal("wok86kie6c", data[310029]);
168+
Assert.Equal("le9i7kbbib", data[310030]);
169+
}
170+
}
147171

148172
[Fact]
149173
public void Read_bit_packed_at_page_boundary()
Binary file not shown.

src/Parquet/File/DataColumnReader.cs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,11 @@ private void ReadDataPage(Thrift.PageHeader ph, ColumnRawData cd, long maxValues
209209

210210
if (ph.Data_page_header == null) throw new ParquetException($"column '{_dataField.Path}' is missing data page header, file is corrupt");
211211

212-
ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, ph.Data_page_header.Num_values, cd);
212+
// If statistics are defined, use the null count to determine the exact number of items we should read.
213+
// However, it is unclear whether all Parquet files with null values have statistics defined. A better solution might
214+
// be to use a count of the defined values (obtained from reading the definition levels?).
215+
int maxReadCount = ph.Data_page_header.Num_values - (int)(ph.Data_page_header.Statistics?.Null_count ?? 0);
216+
ReadColumn(reader, ph.Data_page_header.Encoding, maxValues, maxReadCount, cd);
213217
}
214218
}
215219
}

0 commit comments

Comments
 (0)