Skip to content

Commit 4a24b36

Browse files
committed
GH-35627: [Format][Integration] Add string-view to arrow format
1 parent 6b1bcae commit 4a24b36

File tree

4 files changed

+243
-14
lines changed

4 files changed

+243
-14
lines changed

dev/archery/archery/integration/datagen.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,26 @@ def _get_type(self):
665665
return OrderedDict([('name', 'largeutf8')])
666666

667667

668+
class BinaryViewField(BinaryField):
669+
670+
@property
671+
def column_class(self):
672+
return BinaryViewColumn
673+
674+
def _get_type(self):
675+
return OrderedDict([('name', 'binaryview')])
676+
677+
678+
class StringViewField(StringField):
679+
680+
@property
681+
def column_class(self):
682+
return StringViewColumn
683+
684+
def _get_type(self):
685+
return OrderedDict([('name', 'utf8view')])
686+
687+
668688
class Schema(object):
669689

670690
def __init__(self, fields, metadata=None):
@@ -744,6 +764,74 @@ class LargeStringColumn(_BaseStringColumn, _LargeOffsetsMixin):
744764
pass
745765

746766

767+
class BinaryViewColumn(PrimitiveColumn):
768+
769+
def _encode_value(self, x):
770+
return frombytes(binascii.hexlify(x).upper())
771+
772+
def _get_buffers(self):
773+
views = []
774+
data_buffers = []
775+
# a small default data buffer size is used so we can exercise
776+
# arrays with multiple data buffers with small data sets
777+
DEFAULT_BUFFER_SIZE = 32
778+
INLINE_SIZE = 12
779+
780+
for i, v in enumerate(self.values):
781+
if not self.is_valid[i]:
782+
v = b''
783+
assert isinstance(v, bytes)
784+
785+
if len(v) <= INLINE_SIZE:
786+
# Append an inline view, skip data buffer management.
787+
views.append(OrderedDict([
788+
('SIZE', len(v)),
789+
('INLINED', self._encode_value(v)),
790+
]))
791+
continue
792+
793+
if len(data_buffers) == 0:
794+
# No data buffers have been added yet;
795+
# add this string whole (we may append to it later).
796+
offset = 0
797+
data_buffers.append(v)
798+
elif len(data_buffers[-1]) + len(v) > DEFAULT_BUFFER_SIZE:
799+
# Appending this string to the current active data buffer
800+
# would overflow the default buffer size; add it whole.
801+
offset = 0
802+
data_buffers.append(v)
803+
else:
804+
# Append this string to the current active data buffer.
805+
offset = len(data_buffers[-1])
806+
data_buffers[-1] += v
807+
808+
# the prefix is always 4 bytes so it may not be utf-8
809+
# even if the whole string view is
810+
prefix = frombytes(binascii.hexlify(v[:4]).upper())
811+
812+
views.append(OrderedDict([
813+
('SIZE', len(v)),
814+
('PREFIX_HEX', prefix),
815+
('BUFFER_INDEX', len(data_buffers) - 1),
816+
('OFFSET', offset),
817+
]))
818+
819+
return [
820+
('VALIDITY', [int(x) for x in self.is_valid]),
821+
('VIEWS', views),
822+
('VARIADIC_DATA_BUFFERS', [
823+
frombytes(binascii.hexlify(b).upper())
824+
for b in data_buffers
825+
]),
826+
]
827+
828+
829+
class StringViewColumn(BinaryViewColumn):
830+
831+
def _encode_value(self, x):
832+
return frombytes(x)
833+
834+
747835
class FixedSizeBinaryColumn(PrimitiveColumn):
748836

749837
def _encode_value(self, x):
@@ -1568,6 +1656,15 @@ def generate_run_end_encoded_case():
15681656
return _generate_file("run_end_encoded", fields, batch_sizes)
15691657

15701658

1659+
def generate_binary_view_case():
1660+
fields = [
1661+
BinaryViewField('bv'),
1662+
StringViewField('sv'),
1663+
]
1664+
batch_sizes = [0, 7, 256]
1665+
return _generate_file("binary_view", fields, batch_sizes)
1666+
1667+
15711668
def generate_nested_large_offsets_case():
15721669
fields = [
15731670
LargeListField('large_list_nullable', get_field('item', 'int32')),
@@ -1763,6 +1860,14 @@ def _temp_path():
17631860
.skip_tester('JS')
17641861
.skip_tester('Rust'),
17651862

1863+
generate_binary_view_case()
1864+
.skip_tester('C++')
1865+
.skip_tester('C#')
1866+
.skip_tester('Go')
1867+
.skip_tester('Java')
1868+
.skip_tester('JS')
1869+
.skip_tester('Rust'),
1870+
17661871
generate_extension_case()
17671872
.skip_tester('C#')
17681873
# TODO: ensure the extension is registered in the C++ entrypoint

docs/source/format/Columnar.rst

Lines changed: 98 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
Arrow Columnar Format
2222
*********************
2323

24-
*Version: 1.3*
24+
*Version: 1.4*
2525

2626
The "Arrow Columnar Format" includes a language-agnostic in-memory
2727
data structure specification, metadata serialization, and a protocol
@@ -108,6 +108,10 @@ the different physical layouts defined by Arrow:
108108
* **Variable-size Binary**: a sequence of values each having a variable
109109
byte length. Two variants of this layout are supported using 32-bit
110110
and 64-bit length encoding.
111+
* **Views of Variable-size Binary**: a sequence of values each having a
112+
variable byte length. In contrast to Variable-size Binary, the values
113+
of this layout are distributed across potentially multiple buffers
114+
instead of densely and sequentially packed in a single buffer.
111115
* **Fixed-size List**: a nested layout where each value has the same
112116
number of elements taken from a child data type.
113117
* **Variable-size List**: a nested layout where each value is a
@@ -350,6 +354,51 @@ will be represented as follows: ::
350354
|----------------|-----------------------|
351355
| joemark | unspecified (padding) |
352356

357+
Variable-size Binary View Layout
358+
--------------------------------
359+
360+
.. versionadded:: Arrow Columnar Format 1.4
361+
362+
Each value in this layout consists of 0 or more bytes. These bytes'
363+
locations are indicated using a **views** buffer, which may point to one
364+
of potentially several **data** buffers or may contain the characters
365+
inline.
366+
367+
The views buffer contains ``length`` view structures with the following layout:
368+
369+
::
370+
371+
* Short strings, length <= 12
372+
| Bytes 0-3 | Bytes 4-15 |
373+
|------------|---------------------------------------|
374+
| length | data (padded with 0) |
375+
376+
* Long strings, length > 12
377+
| Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 |
378+
|------------|------------|------------|-------------|
379+
| length | prefix | buf. index | offset |
380+
381+
In both the long and short string cases, the first four bytes encode the
382+
length of the string and can be used to determine how the rest of the view
383+
should be interpreted.
384+
385+
In the short string case the string's bytes are inlined, i.e. stored inside the
386+
view itself, in the twelve bytes which follow the length.
387+
388+
In the long string case, a buffer index indicates which data buffer
389+
stores the data bytes and an offset indicates where in that buffer the
390+
data bytes begin. Buffer index 0 refers to the first data buffer, i.e.
391+
the first buffer **after** the validity buffer and the views buffer.
392+
The half-open range ``[offset, offset + length)`` must be entirely contained
393+
within the indicated buffer. A copy of the first four bytes of the string is
394+
stored inline in the prefix, after the length. This prefix enables a
395+
profitable fast path for string comparisons, which are frequently determined
396+
within the first four bytes.
397+
398+
All integers (length, buffer index, and offset) are signed.
399+
400+
This layout is adapted from TU Munich's `UmbraDB`_.
401+
353402
.. _variable-size-list-layout:
354403

355404
Variable-size List Layout
@@ -880,19 +929,20 @@ For the avoidance of ambiguity, we provide listing the order and type
880929
of memory buffers for each layout.
881930

882931
.. csv-table:: Buffer Layouts
883-
:header: "Layout Type", "Buffer 0", "Buffer 1", "Buffer 2"
884-
:widths: 30, 20, 20, 20
885-
886-
"Primitive",validity,data,
887-
"Variable Binary",validity,offsets,data
888-
"List",validity,offsets,
889-
"Fixed-size List",validity,,
890-
"Struct",validity,,
891-
"Sparse Union",type ids,,
892-
"Dense Union",type ids,offsets,
893-
"Null",,,
894-
"Dictionary-encoded",validity,data (indices),
895-
"Run-end encoded",,,
932+
:header: "Layout Type", "Buffer 0", "Buffer 1", "Buffer 2", "Variadic Buffers"
933+
:widths: 30, 20, 20, 20, 20
934+
935+
"Primitive",validity,data,,
936+
"Variable Binary",validity,offsets,data,
937+
"Variable Binary View",validity,views,,data
938+
"List",validity,offsets,,
939+
"Fixed-size List",validity,,,
940+
"Struct",validity,,,
941+
"Sparse Union",type ids,,,
942+
"Dense Union",type ids,offsets,,
943+
"Null",,,,
944+
"Dictionary-encoded",validity,data (indices),,
945+
"Run-end encoded",,,,
896946

897947
Logical Types
898948
=============
@@ -1071,6 +1121,39 @@ bytes. Since this metadata can be used to communicate in-memory pointer
10711121
addresses between libraries, it is recommended to set ``size`` to the actual
10721122
memory size rather than the padded size.
10731123

1124+
Variadic buffers
1125+
^^^^^^^^^^^^^^^^
1126+
1127+
Some types such as Utf8View are represented using a variable number of buffers.
1128+
For each such Field in the pre-ordered flattened logical schema, there will be
1129+
an entry in ``variadicBufferCounts`` to indicate the number of variadic buffers
1130+
which belong to that Field in the current RecordBatch.
1131+
1132+
For example, consider the schema ::
1133+
1134+
col1: Struct<a: Int32, b: BinaryView, c: Float64>
1135+
col2: Utf8View
1136+
1137+
This has two fields with variadic buffers, so ``variadicBufferCounts`` will
1138+
have two entries in each RecordBatch. For a RecordBatch of this schema with
1139+
``variadicBufferCounts = [3, 2]``, the flattened buffers would be::
1140+
1141+
buffer 0: col1 validity
1142+
buffer 1: col1.a validity
1143+
buffer 2: col1.a values
1144+
buffer 3: col1.b validity
1145+
buffer 4: col1.b views
1146+
buffer 5: col1.b data
1147+
buffer 6: col1.b data
1148+
buffer 7: col1.b data
1149+
buffer 8: col1.c validity
1150+
buffer 9: col1.c values
1151+
buffer 10: col2 validity
1152+
buffer 11: col2 views
1153+
buffer 12: col2 data
1154+
buffer 13: col2 data
1155+
1156+
10741157
Byte Order (`Endianness`_)
10751158
---------------------------
10761159

@@ -1346,3 +1429,4 @@ the Arrow spec.
13461429
.. _Endianness: https://en.wikipedia.org/wiki/Endianness
13471430
.. _SIMD: https://software.intel.com/en-us/cpp-compiler-developer-guide-and-reference-introduction-to-the-simd-data-layout-templates
13481431
.. _Parquet: https://parquet.apache.org/docs/
1432+
.. _UmbraDB: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf

format/Message.fbs

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,22 @@ table RecordBatch {
9999

100100
/// Optional compression of the message body
101101
compression: BodyCompression;
102+
103+
/// Some types such as Utf8View are represented using a variable number of buffers.
104+
/// For each such Field in the pre-ordered flattened logical schema, there will be
105+
/// an entry in variadicBufferCounts to indicate the number of variadic
106+
/// buffers which belong to that Field in the current RecordBatch.
107+
///
108+
/// For example, the schema
109+
/// col1: Struct<a: Int32, b: BinaryView, c: Float64>
110+
/// col2: Utf8View
111+
/// contains two Fields with variadic buffers so variadicBufferCounts will have
112+
/// two entries, the first counting the variadic buffers of `col1.b` and the
113+
/// second counting `col2`'s.
114+
///
115+
/// This field may be omitted if and only if the schema contains no Fields with
116+
/// a variable number of buffers, such as BinaryView and Utf8View.
117+
variadicBufferCounts: [long];
102118
}
103119

104120
/// For sending dictionary encoding information. Any Field can be

format/Schema.fbs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
/// Version 1.1 - Add Decimal256.
2323
/// Version 1.2 - Add Interval MONTH_DAY_NANO.
2424
/// Version 1.3 - Add Run-End Encoded.
25+
/// Version 1.4 - Add BinaryView, Utf8View, and variadicBufferCounts.
2526

2627
namespace org.apache.arrow.flatbuf;
2728

@@ -171,6 +172,27 @@ table LargeUtf8 {
171172
table LargeBinary {
172173
}
173174

175+
/// Logically the same as Utf8, but the internal representation uses a view
176+
/// struct that contains the string length and either the string's entire data
177+
/// inline (for small strings) or an inlined prefix, an index of another buffer,
178+
/// and an offset pointing to a slice in that buffer (for non-small strings).
179+
///
180+
/// Since it uses a variable number of data buffers, each Field with this type
181+
/// must have a corresponding entry in `variadicBufferCounts`.
182+
table Utf8View {
183+
}
184+
185+
/// Logically the same as Binary, but the internal representation uses a view
186+
/// struct that contains the string length and either the string's entire data
187+
/// inline (for small strings) or an inlined prefix, an index of another buffer,
188+
/// and an offset pointing to a slice in that buffer (for non-small strings).
189+
///
190+
/// Since it uses a variable number of data buffers, each Field with this type
191+
/// must have a corresponding entry in `variadicBufferCounts`.
192+
table BinaryView {
193+
}
194+
195+
174196
table FixedSizeBinary {
175197
/// Number of bytes per value
176198
byteWidth: int;
@@ -427,6 +449,8 @@ union Type {
427449
LargeUtf8,
428450
LargeList,
429451
RunEndEncoded,
452+
BinaryView,
453+
Utf8View,
430454
}
431455

432456
/// ----------------------------------------------------------------------

0 commit comments

Comments
 (0)