
Commit e7e907d

Data Column (tensorflow#294)
* DataColumn exposes definition and repetition levels. Much stricter check for schema.
* bug fixed (tensorflow#282): FSC understands nulls on non-atomic level
* updating docs
* oops!
1 parent fe69a55 commit e7e907d

31 files changed: +473 -389 lines changed

.github/workflows/full.yml

+1 -1

@@ -1,7 +1,7 @@
 name: 'Full Workflow'
 
 env:
-  VERSION: 4.6.2
+  VERSION: 4.7.0
   ASM_VERSION: 4.0.0
 
 on:

docs/README.md

+3 -3

@@ -119,15 +119,15 @@ Then, data columns need to be prepared for writing. As parquet is column-based f
 
 ```csharp
 var column1 = new DataColumn(
-    (DataField)schema[0],
+    schema.DataFields[0],
     Enumerable.Range(0, 1_000_000).Select(i => DateTime.UtcNow.AddSeconds(i)).ToArray());
 
 var column2 = new DataColumn(
-    (DataField)schema[1],
+    schema.DataFields[1],
     Enumerable.Range(0, 1_000_000).Select(i => i % 2 == 0 ? "on" : "off").ToArray());
 
 var column3 = new DataColumn(
-    (DataField)schema[2],
+    schema.DataFields[2],
     Enumerable.Range(0, 1_000_000).Select(i => (double)i).ToArray());
 ```

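The `schema` these snippets index into is defined earlier in the README. Purely for context, here is a minimal sketch of a schema shape that would fit the three columns above; the field names are illustrative, not the README's actual ones:

```csharp
using Parquet.Schema;

// illustrative field names only - the README defines its own schema earlier on
var schema = new ParquetSchema(
    new DataField<DateTime>("timestamp"),  // column1 holds DateTime values
    new DataField<string>("state"),        // column2 holds "on"/"off" strings
    new DataField<double>("value"));       // column3 holds doubles

// schema.DataFields[0], [1] and [2] are then the DataField instances
// passed to each DataColumn constructor in the snippet above.
```
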
docs/column.md

+36

@@ -0,0 +1,36 @@
+# DataColumn
+
+`DataColumn` is an essential part of low-level serialization. It represents a column that has actual data.
+
+For simple records that contain atomic types (int, string etc.) the schema only consists of DataColumns.
+
+Here is a sample logical representation of the `DataColumn` class:
+
+```mermaid
+classDiagram
+    class DataColumn {
+        +Field Field
+        +Array DefinedData
+        +Array Data
+        +int[]? DefinitionLevels;
+        +int[]? RepetitionLevels;
+        +DataColumn(DataField field, Array definedData, int[]? definitionLevels, int[]? repetitionLevels)
+        +DataColumn(DataField field, Array data, int[]? repetitionLevels = null)
+    }
+
+```
+
+`Field` is a schema field that defines this column. You can obtain this field from a schema you define.
+
+`DefinedData` is raw data that is defined by `Field`'s type. If your field is nullable, `DefinedData` represents non-nullable values. On the other hand, `Data` represents data as-is, including nulls. If you are reading a `DataColumn` and need to access the data, `Data` is your field. If you need to access data as it is stored in the parquet file, use `DefinedData`. The names are chosen mostly for backward compatibility reasons.
+
+Going further, if you need to access *repetition and definition levels* as they are stored in the parquet file, you can use the corresponding `DefinitionLevels` and `RepetitionLevels` fields.
+
+## Creating DataColumn
+
+There are two public constructors available (see the diagram above). For convenience and backward compatibility, the second constructor accepts a `DataField` and two parameters:
+
+1. `data` is the data to write, including nulls if the field is nullable. DataColumn will decompose the data array into `DefinitionLevels` and `DefinedData` on construction.
+2. `repetitionLevels` are only required if the field is part of a nested type.
+
+The first constructor is more granular and allows you to specify all three parts when constructing a column.

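To make the `Data` / `DefinedData` split described above concrete, here is a small sketch using the convenience constructor from the diagram. The field name and values are made up, and the commented results assume a flat optional (nullable) field where the maximum definition level is 1:

```csharp
using Parquet.Data;
using Parquet.Schema;

var rating = new DataField<int?>("rating");  // nullable, flat field (illustrative)

// convenience constructor: pass the data as-is, nulls included
var column = new DataColumn(rating, new int?[] { 5, null, 3 });

// column.Data             -> { 5, null, 3 }   original values, nulls preserved
// column.DefinedData      -> { 5, 3 }         only the defined values, as stored in the file
// column.DefinitionLevels -> { 1, 0, 1 }      0 marks where the null was
```
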
docs/writing.md

+25 -20
@@ -10,20 +10,24 @@ Writing files is a multi stage process, giving you the full flexibility on what
 4. When required, repeat from step (2) to create more row groups. A row group is like a physical data partition that should fit in memory for processing. It's a guessing game how much data should be in a single row group, but a number of at least 5 thousand rows per column is great. Remember that parquet format works best on large chunks of data.
 
 ```csharp
+// create file schema
+var schema = new ParquetSchema(
+    new DataField<int>("id"),
+    new DataField<string>("city"));
+
 //create data columns with schema metadata and the data you need
 var idColumn = new DataColumn(
-    new DataField<int>("id"),
+    schema.DataFields[0],
     new int[] { 1, 2 });
 
 var cityColumn = new DataColumn(
-    new DataField<string>("city"),
+    schema.DataFields[1],
     new string[] { "London", "Derby" });
 
-// create file schema
-var schema = new ParquetSchema(idColumn.Field, cityColumn.Field);
-
 using(Stream fileStream = System.IO.File.OpenWrite("c:\\test.parquet")) {
     using(ParquetWriter parquetWriter = await ParquetWriter.CreateAsync(schema, fileStream)) {
+        parquetWriter.CompressionMethod = CompressionMethod.Gzip;
+        parquetWriter.CompressionLevel = System.IO.Compression.CompressionLevel.Optimal;
         // create a new row group in the file
         using(ParquetRowGroupWriter groupWriter = parquetWriter.CreateRowGroup()) {
             await groupWriter.WriteColumnAsync(idColumn);
@@ -33,7 +37,9 @@ using(Stream fileStream = System.IO.File.OpenWrite("c:\\test.parquet")) {
 }
 ```
 
-# Specifying Compression Method and Level
+To read more about DataColumn, see [this page](column.md).
+
+### Specifying Compression Method and Level
 
 After constructing `ParquetWriter` you can optionally set the compression method ([`CompressionMethod`](../src/Parquet/CompressionMethod.cs)) and/or compression level ([`CompressionLevel`](https://learn.microsoft.com/en-us/dotnet/api/system.io.compression.compressionlevel?view=net-7.0)), which defaults to `Snappy`. Unless you have specific needs to override compression, the defaults are very reasonable.
 
@@ -48,28 +54,28 @@ using(ParquetWriter parquetWriter = await ParquetWriter.CreateAsync(schema, file
 ```
 
 
-# Appending to Files
+### Appending to Files
 
 This lib supports pseudo appending to files, however it's worth keeping in mind that *row groups are immutable* by design, therefore the only way to append is to create a new row group at the end of the file. It's worth mentioning that small row groups make data compression and reading extremely inefficient, therefore the larger your row group the better.
 
 The following code snippet illustrates this:
 
 ```csharp
 //write a file with a single row group
-var id = new DataField<int>("id");
+var schema = new ParquetSchema(new DataField<int>("id"));
 var ms = new MemoryStream();
 
-using(ParquetWriter writer = await ParquetWriter.CreateAsync(new ParquetSchema(id), ms)) {
+using(ParquetWriter writer = await ParquetWriter.CreateAsync(schema, ms)) {
     using(ParquetRowGroupWriter rg = writer.CreateRowGroup()) {
-        await rg.WriteColumnAsync(new DataColumn(id, new int[] { 1, 2 }));
+        await rg.WriteColumnAsync(new DataColumn(schema.DataFields[0], new int[] { 1, 2 }));
     }
 }
 
 //append to this file. Note that you cannot append to existing row group, therefore create a new one
 ms.Position = 0; // this is to rewind our memory stream, no need to do it in real code.
-using(ParquetWriter writer = await ParquetWriter.CreateAsync(new ParquetSchema(id), ms, append: true)) {
+using(ParquetWriter writer = await ParquetWriter.CreateAsync(schema, ms, append: true)) {
     using(ParquetRowGroupWriter rg = writer.CreateRowGroup()) {
-        await rg.WriteColumnAsync(new DataColumn(id, new int[] { 3, 4 }));
+        await rg.WriteColumnAsync(new DataColumn(schema.DataFields[0], new int[] { 3, 4 }));
     }
 }
 
@@ -80,39 +86,38 @@ using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) {
 
     using(ParquetRowGroupReader rg = reader.OpenRowGroupReader(0)) {
         Assert.Equal(2, rg.RowCount);
-        Assert.Equal(new int[] { 1, 2 }, (await rg.ReadColumnAsync(id)).Data);
+        Assert.Equal(new int[] { 1, 2 }, (await rg.ReadColumnAsync(schema.DataFields[0])).Data);
     }
 
     using(ParquetRowGroupReader rg = reader.OpenRowGroupReader(1)) {
         Assert.Equal(2, rg.RowCount);
-        Assert.Equal(new int[] { 3, 4 }, (await rg.ReadColumnAsync(id)).Data);
+        Assert.Equal(new int[] { 3, 4 }, (await rg.ReadColumnAsync(schema.DataFields[0])).Data);
     }
 
 }
-
 ```
 
 Note that you have to open `ParquetWriter` in **append** mode explicitly in its constructor - `ParquetWriter.CreateAsync(schema, ms, append: true)`. Doing so makes parquet.net open the file, find the file footer and delete it, rewinding the current stream position to the end of actual data. Then, creating more row groups simply writes data to the file as usual, and `.Dispose()` on `ParquetWriter` generates a new file footer, writes it to the file and closes down the stream.
 
 Please keep in mind that row groups are designed to hold a large amount of data (50'000 rows on average), therefore try to find a large enough batch to append to the file. Do not treat a parquet file as a row stream by creating a row group and placing 1-2 rows in it, because this will both increase file size massively and cause a huge performance degradation for a client reading such a file.
 
-# Custom Metadata
+### Custom Metadata
 
 To read and write custom file metadata, you can use the `CustomMetadata` property on `ParquetReader` and `ParquetWriter`, i.e.
 
 ```csharp
 var ms = new MemoryStream();
-var id = new DataField<int>("id");
+var schema = new ParquetSchema(new DataField<int>("id"));
 
 //write
-using(ParquetWriter writer = await ParquetWriter.CreateAsync(new ParquetSchema(id), ms)) {
+using(ParquetWriter writer = await ParquetWriter.CreateAsync(schema, ms)) {
     writer.CustomMetadata = new Dictionary<string, string> {
         ["key1"] = "value1",
         ["key2"] = "value2"
     };
 
     using(ParquetRowGroupWriter rg = writer.CreateRowGroup()) {
-        await rg.WriteColumnAsync(new DataColumn(id, new[] { 1, 2, 3, 4 }));
+        await rg.WriteColumnAsync(new DataColumn(schema.DataFields[0], new[] { 1, 2, 3, 4 }));
     }
 }
 
@@ -123,6 +128,6 @@ using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) {
 }
 ```
 
-# Complex Types
+### Complex Types
 
 To write complex types (arrays, lists, maps, structs) read [this guide](nested_types.md).

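The page above is write-focused. As a counterpart, here is a short sketch of reading the intro file back with the reader APIs that already appear in the appending example, assuming the reader exposes the file schema via its `Schema` property:

```csharp
using System.IO;
using Parquet;
using Parquet.Data;

using(Stream fileStream = System.IO.File.OpenRead("c:\\test.parquet")) {
    using(ParquetReader reader = await ParquetReader.CreateAsync(fileStream)) {
        using(ParquetRowGroupReader rg = reader.OpenRowGroupReader(0)) {
            // read columns back using the data fields from the file's schema
            DataColumn idColumn = await rg.ReadColumnAsync(reader.Schema.DataFields[0]);
            DataColumn cityColumn = await rg.ReadColumnAsync(reader.Schema.DataFields[1]);

            // idColumn.Data   -> { 1, 2 }
            // cityColumn.Data -> { "London", "Derby" }
        }
    }
}
```
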
perf/pyarrow/1.parquet

792 Bytes
Binary file not shown.

perf/pyarrow/run.py

+1

@@ -10,6 +10,7 @@
 
 df = pd.DataFrame({
     "ints": [1, 2, 3],
+    "tags": [[1, 2], [3, 4], [5, 6]]
 }, index=list("abc"))
 
 table = pa.Table.from_pandas(df, preserve_index=False)

src/Parquet.Test/DocRef.cs

+17 -15

@@ -125,18 +125,20 @@ public async Task ReadIntro() {
     }
 
     public async Task WriteIntro() {
+        // create file schema
+        var schema = new ParquetSchema(
+            new DataField<int>("id"),
+            new DataField<string>("city"));
+
         //create data columns with schema metadata and the data you need
         var idColumn = new DataColumn(
-            new DataField<int>("id"),
+            schema.DataFields[0],
             new int[] { 1, 2 });
 
         var cityColumn = new DataColumn(
-            new DataField<string>("city"),
+            schema.DataFields[1],
             new string[] { "London", "Derby" });
 
-        // create file schema
-        var schema = new ParquetSchema(idColumn.Field, cityColumn.Field);
-
         using(Stream fileStream = System.IO.File.OpenWrite("c:\\test.parquet")) {
             using(ParquetWriter parquetWriter = await ParquetWriter.CreateAsync(schema, fileStream)) {
                 parquetWriter.CompressionMethod = CompressionMethod.Gzip;
@@ -152,20 +154,20 @@ public async Task WriteIntro() {
 
     public async Task AppendDemo() {
         //write a file with a single row group
-        var id = new DataField<int>("id");
+        var schema = new ParquetSchema(new DataField<int>("id"));
         var ms = new MemoryStream();
 
-        using(ParquetWriter writer = await ParquetWriter.CreateAsync(new ParquetSchema(id), ms)) {
+        using(ParquetWriter writer = await ParquetWriter.CreateAsync(schema, ms)) {
             using(ParquetRowGroupWriter rg = writer.CreateRowGroup()) {
-                await rg.WriteColumnAsync(new DataColumn(id, new int[] { 1, 2 }));
+                await rg.WriteColumnAsync(new DataColumn(schema.DataFields[0], new int[] { 1, 2 }));
             }
         }
 
        //append to this file. Note that you cannot append to existing row group, therefore create a new one
        ms.Position = 0; // this is to rewind our memory stream, no need to do it in real code.
-        using(ParquetWriter writer = await ParquetWriter.CreateAsync(new ParquetSchema(id), ms, append: true)) {
+        using(ParquetWriter writer = await ParquetWriter.CreateAsync(schema, ms, append: true)) {
            using(ParquetRowGroupWriter rg = writer.CreateRowGroup()) {
-                await rg.WriteColumnAsync(new DataColumn(id, new int[] { 3, 4 }));
+                await rg.WriteColumnAsync(new DataColumn(schema.DataFields[0], new int[] { 3, 4 }));
            }
        }
 
@@ -176,30 +178,30 @@ public async Task AppendDemo() {
 
        using(ParquetRowGroupReader rg = reader.OpenRowGroupReader(0)) {
            Assert.Equal(2, rg.RowCount);
-            Assert.Equal(new int[] { 1, 2 }, (await rg.ReadColumnAsync(id)).Data);
+            Assert.Equal(new int[] { 1, 2 }, (await rg.ReadColumnAsync(schema.DataFields[0])).Data);
        }
 
        using(ParquetRowGroupReader rg = reader.OpenRowGroupReader(1)) {
            Assert.Equal(2, rg.RowCount);
-            Assert.Equal(new int[] { 3, 4 }, (await rg.ReadColumnAsync(id)).Data);
+            Assert.Equal(new int[] { 3, 4 }, (await rg.ReadColumnAsync(schema.DataFields[0])).Data);
        }
 
        }
    }
 
    public async Task CustomMetadata() {
        var ms = new MemoryStream();
-        var id = new DataField<int>("id");
+        var schema = new ParquetSchema(new DataField<int>("id"));
 
        //write
-        using(ParquetWriter writer = await ParquetWriter.CreateAsync(new ParquetSchema(id), ms)) {
+        using(ParquetWriter writer = await ParquetWriter.CreateAsync(schema, ms)) {
            writer.CustomMetadata = new Dictionary<string, string> {
                ["key1"] = "value1",
                ["key2"] = "value2"
            };
 
            using(ParquetRowGroupWriter rg = writer.CreateRowGroup()) {
-                await rg.WriteColumnAsync(new DataColumn(id, new[] { 1, 2, 3, 4 }));
+                await rg.WriteColumnAsync(new DataColumn(schema.DataFields[0], new[] { 1, 2, 3, 4 }));
            }
        }

src/Parquet.Test/DumbStreamTest.cs

+1 -1

@@ -19,7 +19,7 @@ public async Task Read_from_stream_that_only_retuns_one_byte_at_the_time() {
             DateTime[] data = new DateTime[10000];
             for(int i = 0; i < 10000; i++)
                 data[i] = DateTime.UtcNow.AddMilliseconds(i);
-            await rowGroupWriter.WriteColumnAsync(new DataColumn(field, data, 0, 10000));
+            await rowGroupWriter.WriteColumnAsync(new DataColumn(field, data));
         }
 
         int fileSize = (int)memoryFileStream.Length;

src/Parquet.Test/ParquetWriterTest.cs

+4 -51

@@ -165,13 +165,10 @@ public async Task Append_to_file_reads_all_data() {
         }
     }
 
-    public readonly static IEnumerable<object[]> NullableColumnContentCases = new List<object[]>()
-    {
-        new object[] { new int?[] { 1, 2 } },
-        new object[] { new int?[] { null } },
-        new object[] { new int?[] { 1, null, 2 } },
-        new object[] { new int[] { 1, 2 } },
-    };
+    public readonly static IEnumerable<object[]> NullableColumnContentCases = new List<object[]>(){
+        new object[] { new int?[] { 1, 2 } },
+        new object[] { new int?[] { null } },
+        new object[] { new int?[] { 1, null, 2 } } };
 
     [Theory]
     [MemberData(nameof(NullableColumnContentCases))]
@@ -196,50 +193,6 @@ public async Task Write_read_nullable_column(Array input) {
         }
     }
 
-    [Fact]
-    public async Task Writes_only_beginning_of_array() {
-        var ms = new MemoryStream();
-        var id = new DataField<int>("id");
-
-        //write
-        using(ParquetWriter writer = await ParquetWriter.CreateAsync(new ParquetSchema(id), ms)) {
-            using(ParquetRowGroupWriter rg = writer.CreateRowGroup()) {
-                await rg.WriteColumnAsync(new DataColumn(id, new[] { 1, 2, 3, 4 }, 0, 3));
-            }
-        }
-
-        //read back
-        using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) {
-            Assert.Equal(3, reader.ThriftMetadata!.Num_rows);
-
-            using(ParquetRowGroupReader rg = reader.OpenRowGroupReader(0)) {
-                Assert.Equal(new int[] { 1, 2, 3 }, (await rg.ReadColumnAsync(id)).Data);
-            }
-        }
-    }
-
-    [Fact]
-    public async Task Writes_only_end_of_array() {
-        var ms = new MemoryStream();
-        var id = new DataField<int>("id");
-
-        //write
-        using(ParquetWriter writer = await ParquetWriter.CreateAsync(new ParquetSchema(id), ms)) {
-            using(ParquetRowGroupWriter rg = writer.CreateRowGroup()) {
-                await rg.WriteColumnAsync(new DataColumn(id, new[] { 1, 2, 3, 4 }, 1, 3));
-            }
-        }
-
-        //read back
-        using(ParquetReader reader = await ParquetReader.CreateAsync(ms)) {
-            Assert.Equal(3, reader.ThriftMetadata!.Num_rows);
-
-            using(ParquetRowGroupReader rg = reader.OpenRowGroupReader(0)) {
-                Assert.Equal(new int[] { 2, 3, 4 }, (await rg.ReadColumnAsync(id)).Data);
-            }
-        }
-    }
-
     [Fact]
     public async Task FileMetadata_sets_num_rows_on_file_and_row_group() {
         var ms = new MemoryStream();

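The two deleted tests exercised the old `DataColumn(field, data, offset, count)` overload, which no longer appears among the documented constructors (compare docs/column.md and the DumbStreamTest.cs change above). If code relied on it, a sketch of the equivalent after this commit is to slice the array before constructing the column:

```csharp
using Parquet.Data;
using Parquet.Schema;

var id = new DataField<int>("id");
int[] source = { 1, 2, 3, 4 };

// previously: new DataColumn(id, source, 0, 3) and new DataColumn(id, source, 1, 3)
var beginning = new DataColumn(id, source[..3]);  // writes { 1, 2, 3 }
var end       = new DataColumn(id, source[1..]);  // writes { 2, 3, 4 }
```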