Skip to content

Commit 4fe2758

Browse files
azurecoderIvan Gavryliuk
authored and
Ivan Gavryliuk
committed
61 plain non nullable types (tensorflow#68)
* Added a new row indexer for the Parquet data frame.
* Updated all tests and code to use DateTimeOffset.
* Added the logical JSON type.
* Added new dataset handling of rows through pivoting.
* Update PlainValuesReader.cs.
* Built more single responsibility around the ParquetReader type to ensure efficient deallocation of resources using IDisposable.
* Updated the reader to look at nulls.
* Added branches that create the values IList as either nullable or non-nullable, decided by the required attribute on the column header.
* Moved BigDecimal to its own file.
1 parent 1e8407c commit 4fe2758

File tree

3 files changed

+106
-76
lines changed

3 files changed

+106
-76
lines changed

src/Parquet/File/Values/BigDecimal.cs

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
using System.Numerics;
2+
3+
namespace Parquet.File.Values
{
    /// <summary>
    /// A decimal number built from an unscaled integer and a decimal scale,
    /// i.e. <c>value = integer / 10^scale</c>. The scaled result is stored as a
    /// <see cref="decimal"/> in <see cref="Integer"/>.
    /// </summary>
    public struct BigDecimal
    {
        // Maximum number of fractional digits a System.Decimal can represent.
        private const int MaxDecimalScale = 28;

        // Bounds used to decide whether the unscaled value can be cast to decimal directly.
        private static readonly BigInteger MinDecimalValue = (BigInteger) decimal.MinValue;
        private static readonly BigInteger MaxDecimalValue = (BigInteger) decimal.MaxValue;

        /// <summary>
        /// The scaled value (unscaled integer divided by 10^<see cref="Scale"/>).
        /// </summary>
        public decimal Integer { get; set; }

        /// <summary>
        /// Number of decimal digits the unscaled integer was shifted by.
        /// </summary>
        public int Scale { get; set; }

        /// <summary>
        /// Declared precision; stored as given and not used in the conversion.
        /// </summary>
        public int Precision { get; set; }

        /// <summary>
        /// Creates the value <paramref name="integer"/> / 10^<paramref name="scale"/>.
        /// </summary>
        /// <param name="integer">Unscaled integer value.</param>
        /// <param name="scale">Number of fractional decimal digits; values of zero or less apply no scaling.</param>
        /// <param name="precision">Declared precision, stored unchanged.</param>
        /// <exception cref="System.OverflowException">
        /// The scaled value does not fit into a <see cref="decimal"/>.
        /// </exception>
        public BigDecimal(BigInteger integer, int scale, int precision) : this()
        {
            Scale = scale;
            Precision = precision;
            Integer = ToDecimal(integer, scale);
        }

        /// <summary>
        /// Returns the scaled value held in <see cref="Integer"/>.
        /// </summary>
        public static explicit operator decimal(BigDecimal bd)
        {
            return bd.Integer;
        }

        // TODO: Add to byte array for writer

        // Converts unscaled / 10^scale to decimal without overflowing on the intermediate cast.
        private static decimal ToDecimal(BigInteger unscaled, int scale)
        {
            bool fitsDirectly = unscaled >= MinDecimalValue && unscaled <= MaxDecimalValue;

            if (fitsDirectly || scale <= 0)
            {
                // Same arithmetic as the original implementation. When the value
                // does not fit and there is nothing to scale by, the cast throws
                // OverflowException exactly as before.
                decimal value = (decimal) unscaled;
                for (int remaining = scale; remaining > 0; remaining--)
                {
                    value /= 10;
                }
                return value;
            }

            // The unscaled value is too large for decimal, but the scaled value may
            // still fit: split it into whole and fractional parts using BigInteger
            // arithmetic before converting.
            BigInteger divisor = BigInteger.Pow(10, scale);
            BigInteger remainder;
            BigInteger whole = BigInteger.DivRem(unscaled, divisor, out remainder);

            // decimal cannot hold more than 28 fractional digits, so drop (truncate)
            // any digits beyond that before converting the fraction.
            int fractionDigits = scale > MaxDecimalScale ? MaxDecimalScale : scale;
            BigInteger reducedRemainder = remainder / BigInteger.Pow(10, scale - fractionDigits);
            decimal fraction = (decimal) reducedRemainder / (decimal) BigInteger.Pow(10, fractionDigits);

            // Still throws OverflowException when the whole part itself is out of range.
            return (decimal) whole + fraction;
        }
    }
}

src/Parquet/File/Values/PlainValuesReader.cs

+13-59
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
using TType = Parquet.Thrift.Type;
1010
using System.Runtime.CompilerServices;
1111
using System.Numerics;
12+
using System.Reflection;
1213

1314
namespace Parquet.File.Values
1415
{
@@ -65,7 +66,6 @@ private static void ReadPlainBoolean(byte[] data, IList destination, long maxVal
6566
int ibit = 0;
6667
int ibyte = 0;
6768
byte b = data[0];
68-
var destinationTyped = (List<bool?>)destination;
6969

7070
for(int ires = 0; ires < maxValues; ires++)
7171
{
@@ -76,70 +76,64 @@ private static void ReadPlainBoolean(byte[] data, IList destination, long maxVal
7676
}
7777

7878
bool set = ((b >> ibit++) & 1) == 1;
79-
destinationTyped.Add(set);
79+
destination.Add(set);
8080
}
8181
}
8282

8383
[MethodImpl(MethodImplOptions.AggressiveInlining)]
8484
private static void ReadInt32(byte[] data, SchemaElement schema, IList destination)
8585
{
86-
if(schema.Converted_type == ConvertedType.DATE)
86+
if (schema.Converted_type == ConvertedType.DATE)
8787
{
88-
List<DateTimeOffset?> destinationTyped = (List<DateTimeOffset?>)destination;
8988
for (int i = 0; i < data.Length; i += 4)
9089
{
9190
int iv = BitConverter.ToInt32(data, i);
92-
destinationTyped.Add(new DateTimeOffset(iv.FromUnixTime(), TimeSpan.Zero));
91+
destination.Add(new DateTimeOffset(iv.FromUnixTime(), TimeSpan.Zero));
9392
}
9493
}
9594
else
9695
{
97-
List<int?> destinationTyped = (List<int?>)destination;
9896
for (int i = 0; i < data.Length; i += 4)
9997
{
10098
int iv = BitConverter.ToInt32(data, i);
101-
destinationTyped.Add(iv);
99+
destination.Add(iv);
102100
}
103101
}
104102
}
105103

106104
[MethodImpl(MethodImplOptions.AggressiveInlining)]
107105
private static void ReadFloat(byte[] data, SchemaElement schema, IList destination)
108106
{
109-
List<float?> destinationTyped = (List<float?>)destination;
110107
for (int i = 0; i < data.Length; i += 4)
111108
{
112109
float iv = BitConverter.ToSingle(data, i);
113-
destinationTyped.Add(iv);
110+
destination.Add(iv);
114111
}
115112
}
116113

117114
[MethodImpl(MethodImplOptions.AggressiveInlining)]
118115
private static void ReadLong(byte[] data, SchemaElement schema, IList destination)
119116
{
120-
List<long?> destinationTyped = (List<long?>)destination;
121117
for (int i = 0; i < data.Length; i += 8)
122118
{
123119
long lv = BitConverter.ToInt64(data, i);
124-
destinationTyped.Add(lv);
120+
destination.Add(lv);
125121
}
126122
}
127123

128124
[MethodImpl(MethodImplOptions.AggressiveInlining)]
129125
private static void ReadDouble(byte[] data, SchemaElement schema, IList destination)
130126
{
131-
List<double?> destinationTyped = (List<double?>)destination;
132127
for (int i = 0; i < data.Length; i += 8)
133128
{
134129
double lv = BitConverter.ToDouble(data, i);
135-
destinationTyped.Add(lv);
130+
destination.Add(lv);
136131
}
137132
}
138133

139134
[MethodImpl(MethodImplOptions.AggressiveInlining)]
140135
private static void ReadFixedLenByteArray(byte[] data, SchemaElement schema, IList destination)
141136
{
142-
List<decimal?> destinationTyped = (List<decimal?>) destination;
143137
for (int i = 0; i < data.Length; i += schema.Type_length)
144138
{
145139
if (schema.Converted_type != ConvertedType.DECIMAL) continue;
@@ -149,21 +143,15 @@ private static void ReadFixedLenByteArray(byte[] data, SchemaElement schema, ILi
149143
var bigInt = new BigDecimal(new BigInteger(dataNew.Reverse().ToArray()), schema.Scale, schema.Precision);
150144

151145
decimal dc = (decimal) bigInt;
152-
destinationTyped.Add(dc);
146+
destination.Add(dc);
153147
}
154148
}
155149

156150
[MethodImpl(MethodImplOptions.AggressiveInlining)]
157151
private static void ReadInt96(byte[] data, SchemaElement schema, IList destination)
158152
{
159-
#if !SPARK_TYPES
160-
List<BigInteger?> destinationTyped = (List<BigInteger?>)destination;
161-
#else
162-
List<DateTimeOffset?> destinationTyped = (List<DateTimeOffset?>)destination;
163-
#endif
164153

165-
//todo: this is a sample how to read int96, not tested this yet
166-
// todo: need to work this out because Spark is not encoding per spec - working with the Spark encoding instead
154+
167155
#if !SPARK_TYPES
168156
//var r96 = new List<BigInteger>(data.Length / 12);
169157
#else
@@ -188,7 +176,7 @@ private static void ReadInt96(byte[] data, SchemaElement schema, IList destinati
188176
double millis = (double) nanosToInt64 / 1000000D;
189177
bi = bi.AddMilliseconds(millis);
190178
#endif
191-
destinationTyped.Add(new DateTimeOffset(bi));
179+
destination.Add(new DateTimeOffset(bi));
192180

193181
}
194182
}
@@ -203,62 +191,28 @@ private void ReadByteArray(byte[] data, SchemaElement schemaElement, IList desti
203191
schemaElement.Converted_type == ConvertedType.UTF8 || schemaElement.Converted_type == ConvertedType.JSON ||
204192
_options.TreatByteArrayAsString)
205193
{
206-
List<string> destinationTyped = (List<string>)destination;
207194
for (int i = 0; i < data.Length;)
208195
{
209196
int length = BitConverter.ToInt32(data, i);
210197
i += 4; //fast-forward to data
211198
string s = UTF8.GetString(data, i, length);
212199
i += length; //fast-forward to the next element
213-
destinationTyped.Add(s);
200+
destination.Add(s);
214201
}
215202
}
216203
else
217204
{
218-
List<byte[]> destinationTyped = (List<byte[]>)destination;
219205
for (int i = 0; i < data.Length;)
220206
{
221207
int length = BitConverter.ToInt32(data, i);
222208
i += 4; //fast-forward to data
223209
byte[] ar = new byte[length];
224210
Array.Copy(data, i, ar, 0, length);
225211
i += length; //fast-forward to the next element
226-
destinationTyped.Add(ar);
212+
destination.Add(ar);
227213
}
228214
}
229215
}
230216

231-
}
232-
233-
234-
struct BigDecimal
235-
{
236-
public decimal Integer { get; set; }
237-
public int Scale { get; set; }
238-
public int Precision { get; set; }
239-
240-
public BigDecimal(BigInteger integer, int scale, int precision) : this()
241-
{
242-
Integer = (decimal) integer;
243-
Scale = scale;
244-
Precision = precision;
245-
while (Scale > 0)
246-
{
247-
Integer /= 10;
248-
Scale -= 1;
249-
}
250-
Scale = scale;
251-
}
252-
253-
public static explicit operator decimal(BigDecimal bd)
254-
{
255-
return bd.Integer;
256-
}
257-
258-
// TODO: Add to byte array for writer
259-
260-
261-
262-
263217
}
264218
}

src/Parquet/ParquetColumn.cs

+62-17
Original file line numberDiff line numberDiff line change
@@ -189,28 +189,58 @@ internal static IList CreateValuesList(SchemaElement schema, out Type systemType
189189
switch(schema.Type)
190190
{
191191
case TType.BOOLEAN:
192-
systemType = typeof(bool?);
193-
return new List<bool?>();
192+
if (schema.Repetition_type == FieldRepetitionType.OPTIONAL)
193+
{
194+
systemType = typeof(bool?);
195+
return new List<bool?>();
196+
}
197+
systemType = typeof(bool);
198+
return new List<bool>();
194199
case TType.INT32:
195200
if(schema.Converted_type == ConvertedType.DATE)
196201
{
197-
systemType = typeof(DateTimeOffset?);
198-
return new List<DateTimeOffset?>();
202+
if (schema.Repetition_type == FieldRepetitionType.OPTIONAL)
203+
{
204+
systemType = typeof(DateTimeOffset?);
205+
return new List<DateTimeOffset?>();
206+
}
207+
systemType = typeof(DateTimeOffset);
208+
return new List<DateTimeOffset>();
199209
}
200210
else
201211
{
202-
systemType = typeof(int?);
203-
return new List<int?>();
212+
if (schema.Repetition_type == FieldRepetitionType.OPTIONAL)
213+
{
214+
systemType = typeof(int?);
215+
return new List<int?>();
216+
}
217+
systemType = typeof(int);
218+
return new List<int>();
204219
}
205220
case TType.FLOAT:
206-
systemType = typeof(float?);
207-
return new List<float?>();
221+
if (schema.Repetition_type == FieldRepetitionType.OPTIONAL)
222+
{
223+
systemType = typeof(float?);
224+
return new List<float?>();
225+
}
226+
systemType = typeof(float);
227+
return new List<float>();
208228
case TType.INT64:
209-
systemType = typeof(long?);
210-
return new List<long?>();
229+
if (schema.Repetition_type == FieldRepetitionType.OPTIONAL)
230+
{
231+
systemType = typeof(long?);
232+
return new List<long?>();
233+
}
234+
systemType = typeof(long);
235+
return new List<long>();
211236
case TType.DOUBLE:
212-
systemType = typeof(double?);
213-
return new List<double?>();
237+
if (schema.Repetition_type == FieldRepetitionType.OPTIONAL)
238+
{
239+
systemType = typeof(double?);
240+
return new List<double?>();
241+
}
242+
systemType = typeof(double);
243+
return new List<double>();
214244
case TType.INT96:
215245
#if !SPARK_TYPES
216246
systemType = typeof(DateTimeOffset?);
@@ -227,20 +257,35 @@ internal static IList CreateValuesList(SchemaElement schema, out Type systemType
227257
}
228258
else
229259
{
230-
systemType = typeof(bool?);
231-
return new List<bool?>();
260+
if (schema.Repetition_type == FieldRepetitionType.OPTIONAL)
261+
{
262+
systemType = typeof(bool?);
263+
return new List<bool?>();
264+
}
265+
systemType = typeof(bool);
266+
return new List<bool>();
232267
}
233268
case TType.FIXED_LEN_BYTE_ARRAY:
234269
// TODO: Converted type should work differently shouldn't inline in this way
235270
if (schema.Converted_type == ConvertedType.DECIMAL)
236271
{
272+
if (schema.Repetition_type == FieldRepetitionType.OPTIONAL)
273+
{
274+
systemType = typeof(decimal?);
275+
return new List<decimal?>();
276+
}
237277
systemType = typeof(decimal);
238-
return new List<decimal?>();
278+
return new List<decimal>();
239279
}
240280
else
241281
{
242-
systemType = typeof(byte?[]);
243-
return new List<byte?[]>();
282+
if (schema.Repetition_type == FieldRepetitionType.OPTIONAL)
283+
{
284+
systemType = typeof(byte?[]);
285+
return new List<byte?[]>();
286+
}
287+
systemType = typeof(byte[]);
288+
return new List<byte[]>();
244289
}
245290
default:
246291
throw new NotImplementedException($"type {schema.Type} not implemented");

0 commit comments

Comments
 (0)