Skip to content

Commit 3765aef

Browse files
committed
support decoding legacy lists with no inner element (tensorflow#286)
1 parent 2fd609f commit 3765aef

File tree

6 files changed

+110
-28
lines changed

6 files changed

+110
-28
lines changed

src/Parquet.Test/ParquetReaderOnTestFilesTest.cs

+12
Original file line numberDiff line numberDiff line change
@@ -133,5 +133,17 @@ public async Task Read_col_names_with_trailing_dots() {
133133
Table tbl = await ParquetReader.ReadTableFromStreamAsync(s);
134134
Assert.NotNull(tbl);
135135
}
136+
137+
// Regression test: opens the "special/legacy-list.parquet" test file, reads it via
// ReadEntireRowGroupAsync and checks the number of columns and the data of each one.
[Fact]
138+
public async Task Read_legacy_list() {
139+
using Stream s = OpenTestFile("special/legacy-list.parquet");
140+
using ParquetReader r = await ParquetReader.CreateAsync(s);
141+
DataColumn[] cols = await r.ReadEntireRowGroupAsync();
142+
143+
// exactly three data columns are expected back
Assert.Equal(3, cols.Length);
144+
// first two columns each hold two identical values
Assert.Equal(new string[] { "1_0", "1_0" }, cols[0].Data);
145+
Assert.Equal(new double[] { 2004, 2004 }, cols[1].Data);
146+
// third column holds the sequence 0..167 twice in a row
// (presumably the flattened items of two list rows - confirm against the fixture)
Assert.Equal(Enumerable.Range(0, 168).Concat(Enumerable.Range(0, 168)).ToArray(), cols[2].Data);
147+
}
136148
}
137149
}

src/Parquet.Test/Schema/SchemaTest.cs

+50
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
using CT = Parquet.Thrift.ConvertedType;
1111
using System.Numerics;
1212
using Parquet.Encodings;
13+
using Parquet.File;
1314

1415
namespace Parquet.Test.Schema {
1516
public class SchemaTest : TestBase {
@@ -336,5 +337,54 @@ public void SystemTypeToThriftMapping(Type t, TT expectedTT, CT? expectedCT) {
336337
Assert.Equal(expectedTT, foundTT);
337338
Assert.Equal(expectedCT, foundCT);
338339
}
340+
341+
// Parses a three-element thrift schema (outer LIST-annotated element -> repeated "list"
// element -> required INT32 "element") and checks that the first schema field is
// a ListField named "my_list" whose item is named "element".
[Fact]
342+
public void Decode_list_normal() {
343+
ParquetSchema schema = ThriftFooter.Parse(
344+
// outer element: annotated as LIST, declares one child
new Thrift.SchemaElement("my_list") {
345+
Converted_type = CT.LIST,
346+
Num_children = 1
347+
},
348+
// middle element: repeated, declares one child
new Thrift.SchemaElement("list") {
349+
Repetition_type = Thrift.FieldRepetitionType.REPEATED,
350+
Num_children = 1
351+
},
352+
// innermost element: required INT32 value
new Thrift.SchemaElement("element") {
353+
Repetition_type = Thrift.FieldRepetitionType.REQUIRED,
354+
Type = TT.INT32
355+
});
356+
357+
Field f = schema[0];
358+
if(f is ListField lf) {
359+
Assert.Equal("my_list", lf.Name);
360+
Assert.Equal("element", lf.Item.Name);
361+
} else {
362+
// any other field type means the list was not recognised
Assert.Fail("list expected");
363+
}
364+
}
365+
366+
// Parses a thrift schema whose outer LIST-annotated element leaves Num_children unset,
// followed by a repeated "list" element and a required INT32 "element", and checks that
// the first schema field is still a ListField named "my_list" with an item named "element".
// NOTE(review): the "list" element below still sets Num_children = 1, and TryBuildList
// treats a second element with Num_children > 0 as a group - confirm this input really
// exercises the legacy "no mid group" branch rather than the modern list path.
[Fact]
367+
public void Decode_list_legacy_no_mid_group() {
368+
ParquetSchema schema = ThriftFooter.Parse(
369+
// outer element: annotated as LIST, Num_children intentionally not set here
new Thrift.SchemaElement("my_list") {
370+
Converted_type = CT.LIST
371+
},
372+
// second element: repeated, declares one child
new Thrift.SchemaElement("list") {
373+
Repetition_type = Thrift.FieldRepetitionType.REPEATED,
374+
Num_children = 1
375+
},
376+
// third element: required INT32 value
new Thrift.SchemaElement("element") {
377+
Repetition_type = Thrift.FieldRepetitionType.REQUIRED,
378+
Type = TT.INT32
379+
});
380+
381+
Field f = schema[0];
382+
if(f is ListField lf) {
383+
Assert.Equal("my_list", lf.Name);
384+
Assert.Equal("element", lf.Item.Name);
385+
} else {
386+
// any other field type means the list was not recognised
Assert.Fail("list expected");
387+
}
388+
}
339389
}
340390
}
Binary file not shown.

src/Parquet/Encodings/SchemaEncoder.cs

+16-19
Original file line numberDiff line numberDiff line change
@@ -169,22 +169,28 @@ static bool TryBuildList(List<Thrift.SchemaElement> schema,
169169
ref int index, out int ownedChildren,
170170
out ListField? field) {
171171

172-
Thrift.SchemaElement se = schema[index];
172+
Thrift.SchemaElement outerGroup = schema[index];
173173

174-
if(!(se.__isset.converted_type && se.Converted_type == Thrift.ConvertedType.LIST)) {
174+
if(!(outerGroup.__isset.converted_type && outerGroup.Converted_type == Thrift.ConvertedType.LIST)) {
175175
ownedChildren = 0;
176176
field = null;
177177
return false;
178178
}
179179

180-
Thrift.SchemaElement tseList = schema[index];
181-
field = ListField.CreateWithNoItem(tseList.Name, tseList.Repetition_type != FieldRepetitionType.REQUIRED);
180+
field = ListField.CreateWithNoItem(outerGroup.Name, outerGroup.Repetition_type != FieldRepetitionType.REQUIRED);
182181

183182
//https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules
184-
Thrift.SchemaElement tseRepeated = schema[index + 1];
183+
Thrift.SchemaElement midGroup = schema[index + 1];
184+
bool midIsGroup = midGroup.Num_children > 0;
185185

186186
// Rule 1. If the repeated field is not a group, then its type is the element type and elements are required.
187-
// todo: not implemented
187+
if(!midIsGroup) {
188+
field.Path = new FieldPath(outerGroup.Name);
189+
field.ThriftSchemaElement = outerGroup;
190+
index += 1; //only skip this element
191+
ownedChildren = 1; // next element is list's item
192+
return true;
193+
}
188194

189195
// Rule 2. If the repeated field is a group with multiple fields, then its type is the element type and elements are required.
190196
// todo: not implemented
@@ -194,20 +200,11 @@ static bool TryBuildList(List<Thrift.SchemaElement> schema,
194200
// type and elements are required.
195201
// todo: not implemented fully, only "array"
196202

197-
// "group with one field and is named either array":
198-
if(tseList.Num_children == 1 && tseRepeated.Name == "array") {
199-
field.Path = new FieldPath(tseList.Name);
200-
index += 1; //only skip this element
201-
ownedChildren = 1;
202-
return true;
203-
}
204-
205203
// Normal "modern" LIST:
206-
//as we are skipping elements set path hint
207-
Thrift.SchemaElement tseRepeatedGroup = schema[index + 1];
208-
field.Path = new FieldPath(tseList.Name, tseRepeatedGroup.Name);
209-
field.ThriftSchemaElement = se;
210-
field.GroupSchemaElement = tseRepeatedGroup;
204+
// as we are skipping elements set path hint
205+
field.Path = new FieldPath(outerGroup.Name, midGroup.Name);
206+
field.ThriftSchemaElement = outerGroup;
207+
field.GroupSchemaElement = midGroup;
211208
index += 2; //skip this element and child container
212209
ownedChildren = 1; //we should get this element assigned back
213210
return true;

src/Parquet/File/ThriftFooter.cs

+12
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,18 @@ public ThriftFooter(Thrift.FileMetaData fileMeta) {
2424
_tree = new ThriftSchemaTree(_fileMeta.Schema);
2525
}
2626

27+
/// <summary>
/// Builds a <see cref="ParquetSchema"/> from raw thrift schema elements by prepending
/// a synthetic "root" element, wrapping the list in a <c>Thrift.FileMetaData</c> and
/// running it through <c>CreateModelSchema</c>.
/// </summary>
/// <param name="elements">Thrift schema elements, appended after the synthetic root in the given order.</param>
/// <returns>The result of <c>CreateModelSchema(null)</c> for the assembled footer.</returns>
internal static ParquetSchema Parse(params Thrift.SchemaElement[] elements) {
28+
29+
// NOTE(review): the root is declared with exactly one child regardless of how many
// elements are passed - confirm behaviour if more than one top-level field is supplied.
var slst = new List<Thrift.SchemaElement> {
30+
new Thrift.SchemaElement("root") { Num_children = 1 },
31+
};
32+
slst.AddRange(elements);
33+
34+
// only Schema is populated on the metadata; all other FileMetaData members keep their defaults
return new ThriftFooter(new Thrift.FileMetaData {
35+
Schema = slst
36+
}).CreateModelSchema(null);
37+
}
38+
2739
public ThriftFooter(ParquetSchema schema, long totalRowCount) {
2840
if(schema == null) {
2941
throw new ArgumentNullException(nameof(schema));

src/Parquet/Schema/ListField.cs

+20-9
Original file line numberDiff line numberDiff line change
@@ -96,16 +96,27 @@ internal override FieldPath? PathPrefix {
9696

9797
internal override void PropagateLevels(int parentRepetitionLevel, int parentDefinitionLevel) {
9898

99-
// both get
10099
MaxDefinitionLevel = parentDefinitionLevel;
101-
MaxRepetitionLevel = parentRepetitionLevel + 1; // because it's repeated ;)
102-
103-
if(IsNullable) {
104-
MaxDefinitionLevel++;
105-
}
106-
107-
if(GroupSchemaElement == null || GroupSchemaElement.Repetition_type != Thrift.FieldRepetitionType.REQUIRED) {
108-
MaxDefinitionLevel++;
100+
MaxRepetitionLevel = parentRepetitionLevel;
101+
102+
if(ThriftSchemaElement != null) {
103+
// building from file
104+
if(IsNullable)
105+
MaxDefinitionLevel += 1;
106+
107+
if(GroupSchemaElement != null) {
108+
if(GroupSchemaElement.Repetition_type != Thrift.FieldRepetitionType.REQUIRED)
109+
MaxDefinitionLevel += 1;
110+
111+
MaxRepetitionLevel += 1;
112+
}
113+
} else {
114+
// probably building manually
115+
if(IsNullable)
116+
MaxDefinitionLevel += 1;
117+
118+
MaxDefinitionLevel += 1; // assuming optional group
119+
MaxRepetitionLevel += 1; // assuming non-legacy lists, which have repeated group
109120
}
110121

111122
//push to child item

0 commit comments

Comments
 (0)