GH-35627: [Format][Integration] Add string-view to arrow format

bkietz · bkietz · commit 4a24b36ca038 · 2023-09-20T20:10:04.000-04:00
diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py
@@ -665,6 +665,26 @@ def _get_type(self):
         return OrderedDict([('name', 'largeutf8')])
 
 
+class BinaryViewField(BinaryField):
+
+    @property
+    def column_class(self):
+        return BinaryViewColumn
+
+    def _get_type(self):
+        return OrderedDict([('name', 'binaryview')])
+
+
+class StringViewField(StringField):
+
+    @property
+    def column_class(self):
+        return StringViewColumn
+
+    def _get_type(self):
+        return OrderedDict([('name', 'utf8view')])
+
+
 class Schema(object):
 
     def __init__(self, fields, metadata=None):
@@ -744,6 +764,74 @@ class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin):
     pass
 
 
+class BinaryViewColumn(PrimitiveColumn):
+
+    def _encode_value(self, x):
+        return frombytes(binascii.hexlify(x).upper())
+
+    def _get_buffers(self):
+        views = []
+        data_buffers = []
+        # a small default data buffer size is used so we can exercise
+        # arrays with multiple data buffers with small data sets
+        DEFAULT_BUFFER_SIZE = 32
+        INLINE_SIZE = 12
+
+        for i, v in enumerate(self.values):
+            if not self.is_valid[i]:
+                v = b''
+            assert isinstance(v, bytes)
+
+            if len(v) <= INLINE_SIZE:
+                # Append an inline view, skip data buffer management.
+                views.append(OrderedDict([
+                    ('SIZE', len(v)),
+                    ('INLINED', self._encode_value(v)),
+                ]))
+                continue
+
+            if len(data_buffers) == 0:
+                # No data buffers have been added yet;
+                # add this string whole (we may append to it later).
+                offset = 0
+                data_buffers.append(v)
+            elif len(data_buffers[-1]) + len(v) > DEFAULT_BUFFER_SIZE:
+                # Appending this string to the current active data buffer
+                # would overflow the default buffer size; add it whole.
+                offset = 0
+                data_buffers.append(v)
+            else:
+                # Append this string to the current active data buffer.
+                offset = len(data_buffers[-1])
+                data_buffers[-1] += v
+
+            # the prefix is always 4 bytes so it may not be utf-8
+            # even if the whole string view is
+            prefix = frombytes(binascii.hexlify(v[:4]).upper())
+
+            views.append(OrderedDict([
+                ('SIZE', len(v)),
+                ('PREFIX_HEX', prefix),
+                ('BUFFER_INDEX', len(data_buffers) - 1),
+                ('OFFSET', offset),
+            ]))
+
+        return [
+            ('VALIDITY', [int(x) for x in self.is_valid]),
+            ('VIEWS', views),
+            ('VARIADIC_DATA_BUFFERS', [
+                frombytes(binascii.hexlify(b).upper())
+                for b in data_buffers
+            ]),
+        ]
+
+
+class StringViewColumn(BinaryViewColumn):
+
+    def _encode_value(self, x):
+        return frombytes(x)
+
+
 class FixedSizeBinaryColumn(PrimitiveColumn):
 
     def _encode_value(self, x):
@@ -1568,6 +1656,15 @@ def generate_run_end_encoded_case():
     return _generate_file("run_end_encoded", fields, batch_sizes)
 
 
+def generate_binary_view_case():
+    fields = [
+        BinaryViewField('bv'),
+        StringViewField('sv'),
+    ]
+    batch_sizes = [0, 7, 256]
+    return _generate_file("binary_view", fields, batch_sizes)
+
+
 def generate_nested_large_offsets_case():
     fields = [
         LargeListField('large_list_nullable', get_field('item', 'int32')),
@@ -1763,6 +1860,14 @@ def _temp_path():
         .skip_tester('JS')
         .skip_tester('Rust'),
 
+        generate_binary_view_case()
+        .skip_tester('C++')
+        .skip_tester('C#')
+        .skip_tester('Go')
+        .skip_tester('Java')
+        .skip_tester('JS')
+        .skip_tester('Rust'),
+
         generate_extension_case()
         .skip_tester('C#')
         # TODO: ensure the extension is registered in the C++ entrypoint
diff --git a/docs/source/format/Columnar.rst b/docs/source/format/Columnar.rst
@@ -21,7 +21,7 @@
 Arrow Columnar Format
 *********************
 
-*Version: 1.3*
+*Version: 1.4*
 
 The "Arrow Columnar Format" includes a language-agnostic in-memory
 data structure specification, metadata serialization, and a protocol
@@ -108,6 +108,10 @@ the different physical layouts defined by Arrow:
 * **Variable-size Binary**: a sequence of values each having a variable
   byte length. Two variants of this layout are supported using 32-bit
   and 64-bit length encoding.
+* **Views of Variable-size Binary**: a sequence of values each having a
+  variable byte length. In contrast to Variable-size Binary, the values
+  of this layout are distributed across potentially multiple buffers
+  instead of densely and sequentially packed in a single buffer.
 * **Fixed-size List**: a nested layout where each value has the same
   number of elements taken from a child data type.
 * **Variable-size List**: a nested layout where each value is a
@@ -350,6 +354,51 @@ will be represented as follows: ::
     |----------------|-----------------------|
     | joemark        | unspecified (padding) |
 
+Variable-size Binary View Layout
+--------------------------------
+
+.. versionadded:: Arrow Columnar Format 1.4
+
+Each value in this layout consists of 0 or more bytes. These bytes'
+locations are indicated using a **views** buffer, which may point to one
+of potentially several **data** buffers or may contain the characters
+inline.
+
+The views buffer contains `length` view structures with the following layout:
+
+::
+
+    * Short strings, length <= 12
+      | Bytes 0-3  | Bytes 4-15                            |
+      |------------|---------------------------------------|
+      | length     | data (padded with 0)                  |
+
+    * Long strings, length > 12
+      | Bytes 0-3  | Bytes 4-7  | Bytes 8-11 | Bytes 12-15 |
+      |------------|------------|------------|-------------|
+      | length     | prefix     | buf. index | offset      |
+
+In both the long and short string cases, the first four bytes encode the
+length of the string and can be used to determine how the rest of the view
+should be interpreted.
+
+In the short string case the string's bytes are inlined- stored inside the
+view itself, in the twelve bytes which follow the length.
+
+In the long string case, a buffer index indicates which data buffer
+stores the data bytes and an offset indicates where in that buffer the
+data bytes begin. Buffer index 0 refers to the first data buffer, IE
+the first buffer **after** the validity buffer and the views buffer.
+The half-open range ``[offset, offset + length)`` must be entirely contained
+within the indicated buffer. A copy of the first four bytes of the string is
+stored inline in the prefix, after the length. This prefix enables a
+profitable fast path for string comparisons, which are frequently determined
+within the first four bytes.
+
+All integers (length, buffer index, and offset) are signed.
+
+This layout is adapted from TU Munich's `UmbraDB`_.
+
 .. _variable-size-list-layout:
 
 Variable-size List Layout
@@ -880,19 +929,20 @@ For the avoidance of ambiguity, we provide listing the order and type
 of memory buffers for each layout.
 
 .. csv-table:: Buffer Layouts
-   :header: "Layout Type", "Buffer 0", "Buffer 1", "Buffer 2"
-   :widths: 30, 20, 20, 20
-
-   "Primitive",validity,data,
-   "Variable Binary",validity,offsets,data
-   "List",validity,offsets,
-   "Fixed-size List",validity,,
-   "Struct",validity,,
-   "Sparse Union",type ids,,
-   "Dense Union",type ids,offsets,
-   "Null",,,
-   "Dictionary-encoded",validity,data (indices),
-   "Run-end encoded",,,
+   :header: "Layout Type", "Buffer 0", "Buffer 1", "Buffer 2", "Variadic Buffers"
+   :widths: 30, 20, 20, 20, 20
+
+   "Primitive",validity,data,,
+   "Variable Binary",validity,offsets,data,
+   "Variable Binary View",validity,views,,data
+   "List",validity,offsets,,
+   "Fixed-size List",validity,,,
+   "Struct",validity,,,
+   "Sparse Union",type ids,,,
+   "Dense Union",type ids,offsets,,
+   "Null",,,,
+   "Dictionary-encoded",validity,data (indices),,
+   "Run-end encoded",,,,
 
 Logical Types
 =============
@@ -1071,6 +1121,39 @@ bytes. Since this metadata can be used to communicate in-memory pointer
 addresses between libraries, it is recommended to set ``size`` to the actual
 memory size rather than the padded size.
 
+Variadic buffers
+^^^^^^^^^^^^^^^^
+
+Some types such as Utf8View are represented using a variable number of buffers.
+For each such Field in the pre-ordered flattened logical schema, there will be
+an entry in ``variadicBufferCounts`` to indicate the number of variadic buffers
+which belong to that Field in the current RecordBatch.
+
+For example, consider the schema ::
+
+    col1: Struct<a: Int32, b: BinaryView, c: Float64>
+    col2: Utf8View
+
+This has two fields with variadic buffers, so ``variadicBufferCounts`` will
+have two entries in each RecordBatch. For a RecordBatch of this schema with
+``variadicBufferCounts = [3, 2]``, the flattened buffers would be::
+
+    buffer 0:  col1    validity
+    buffer 1:  col1.a  validity
+    buffer 2:  col1.a  values
+    buffer 3:  col1.b  validity
+    buffer 4:  col1.b  views
+    buffer 5:  col1.b  data
+    buffer 6:  col1.b  data
+    buffer 7:  col1.b  data
+    buffer 8:  col1.c  validity
+    buffer 9:  col1.c  values
+    buffer 10: col2    validity
+    buffer 11: col2    views
+    buffer 12: col2    data
+    buffer 13: col2    data
+
+
 Byte Order (`Endianness`_)
 ---------------------------
 
@@ -1346,3 +1429,4 @@ the Arrow spec.
 .. _Endianness: https://en.wikipedia.org/wiki/Endianness
 .. _SIMD: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-introduction-to-the-simd-data-layout-templates
 .. _Parquet: https://parquet.apache.org/docs/
+.. _UmbraDB: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
diff --git a/format/Message.fbs b/format/Message.fbs
@@ -99,6 +99,22 @@ table RecordBatch {
 
   /// Optional compression of the message body
   compression: BodyCompression;
+
+  /// Some types such as Utf8View are represented using a variable number of buffers.
+  /// For each such Field in the pre-ordered flattened logical schema, there will be
+  /// an entry in variadicBufferCounts to indicate the number of number of variadic
+  /// buffers which belong to that Field in the current RecordBatch.
+  ///
+  /// For example, the schema
+  ///     col1: Struct<a: Int32, b: BinaryView, c: Float64>
+  ///     col2: Utf8View
+  /// contains two Fields with variadic buffers so variadicBufferCounts will have
+  /// two entries, the first counting the variadic buffers of `col1.b` and the
+  /// second counting `col2`'s.
+  ///
+  /// This field may be omitted if and only if the schema contains no Fields with
+  /// a variable number of buffers, such as BinaryView and Utf8View.
+  variadicBufferCounts: [long];
 }
 
 /// For sending dictionary encoding information. Any Field can be
diff --git a/format/Schema.fbs b/format/Schema.fbs
@@ -22,6 +22,7 @@
 /// Version 1.1 - Add Decimal256.
 /// Version 1.2 - Add Interval MONTH_DAY_NANO.
 /// Version 1.3 - Add Run-End Encoded.
+/// Version 1.4 - Add BinaryView, Utf8View, and variadicBufferCounts.
 
 namespace org.apache.arrow.flatbuf;
 
@@ -171,6 +172,27 @@ table LargeUtf8 {
 table LargeBinary {
 }
 
+/// Logically the same as Utf8, but the internal representation uses a view
+/// struct that contains the string length and either the string's entire data
+/// inline (for small strings) or an inlined prefix, an index of another buffer,
+/// and an offset pointing to a slice in that buffer (for non-small strings).
+///
+/// Since it uses a variable number of data buffers, each Field with this type
+/// must have a corresponding entry in `variadicBufferCounts`.
+table Utf8View {
+}
+
+/// Logically the same as Binary, but the internal representation uses a header
+/// struct that contains the string length and either the string's entire data
+/// inline (for small strings) or an inlined prefix, an index of another buffer,
+/// and an offset pointing to a slice in that buffer (for non-small strings).
+///
+/// Since it uses a variable number of data buffers, each Field with this type
+/// must have a corresponding entry in `variadicBufferCounts`.
+table BinaryView {
+}
+
+
 table FixedSizeBinary {
   /// Number of bytes per value
   byteWidth: int;
@@ -427,6 +449,8 @@ union Type {
   LargeUtf8,
   LargeList,
   RunEndEncoded,
+  BinaryView,
+  Utf8View,
 }
 
 /// ----------------------------------------------------------------------