Skip to content

Commit db663d8

Browse files
authored
Minor: Document pattern for accessing views in StringView (#6673)
1 parent 22bc772 commit db663d8

File tree

1 file changed

+70
-9
lines changed

1 file changed

+70
-9
lines changed

arrow-array/src/array/byte_view_array.rs

Lines changed: 70 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,18 @@ use std::sync::Arc;
3333

3434
use super::ByteArrayType;
3535

36-
/// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
36+
/// [Variable-size Binary View Layout]: An array of variable length bytes views.
37+
///
38+
/// This array type is used to store variable length byte data (e.g. Strings, Binary)
39+
/// and has efficient operations such as `take`, `filter`, and comparison.
3740
///
3841
/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
3942
///
40-
/// This is different from [`GenericByteArray`] as it stores both an offset and
41-
/// length meaning that take / filter operations can be implemented without
42-
/// copying the underlying data. In addition, it stores an inlined prefix which
43-
/// can be used to speed up comparisons.
43+
/// This is different from [`GenericByteArray`], which also stores variable
44+
/// length byte data, as it represents strings with an offset and length. `take`
45+
/// and `filter`-like operations are implemented by manipulating the "views"
46+
/// (`u128`) without modifying the bytes. Each view also stores an inlined
47+
/// prefix which speeds up comparisons.
4448
///
4549
/// # See Also
4650
///
@@ -50,11 +54,18 @@ use super::ByteArrayType;
5054
///
5155
/// [`ByteView`]: arrow_data::ByteView
5256
///
53-
/// # Notes
57+
/// # Use the [`eq`] kernel to compare the logical content.
58+
///
59+
/// Comparing two `GenericByteViewArray`s using `PartialEq` compares by structure
60+
/// (the `u128`s) and contents of the buffers, not by logical content. As there
61+
/// are many different buffer layouts to represent the same data (e.g. different
62+
/// offsets, different buffer sizes, etc.), two arrays with the same data may not
63+
/// compare equal.
64+
///
65+
/// To compare the logical content of two `GenericByteViewArray`s, use the [`eq`]
66+
/// kernel.
5467
///
55-
/// Comparing two `GenericByteViewArray` using PartialEq compares by structure,
56-
/// not by value. as there are many different buffer layouts to represent the
57-
/// same data (e.g. different offsets, different buffer sizes, etc).
68+
/// [`eq`]: https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.eq.html
5869
///
5970
/// # Layout: "views" and buffers
6071
///
@@ -86,6 +97,52 @@ use super::ByteArrayType;
8697
/// view and the entire string is stored in one of the buffers. See [`ByteView`]
8798
/// to access the fields of these views.
8899
///
100+
/// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
101+
/// the easiest and fastest way to work with this data. However, it is possible
102+
/// to access the views and buffers directly for more control.
103+
///
104+
/// For example
105+
///
106+
/// ```rust
107+
/// # use arrow_array::StringViewArray;
108+
/// # use arrow_array::Array;
109+
/// use arrow_data::ByteView;
110+
/// let array = StringViewArray::from(vec![
111+
/// "hello",
112+
/// "this string is longer than 12 bytes",
113+
/// "this string is also longer than 12 bytes"
114+
/// ]);
115+
///
116+
/// // ** Examine the first view (short string) **
117+
/// assert!(array.is_valid(0)); // Check for nulls
118+
/// let short_view: u128 = array.views()[0]; // "hello"
119+
/// // get length of the string
120+
/// let len = short_view as u32;
121+
/// assert_eq!(len, 5); // strings of 12 bytes or fewer are stored inline in the view
122+
/// // SAFETY: `short_view` is a valid view and `len` is the length read from it
123+
/// let value = unsafe {
124+
/// StringViewArray::inline_value(&short_view, len as usize)
125+
/// };
126+
/// assert_eq!(value, b"hello");
127+
///
128+
/// // ** Examine the third view (long string) **
129+
/// assert!(array.is_valid(2)); // Check for nulls
130+
/// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
131+
/// let len = long_view as u32;
132+
/// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
133+
/// let view = ByteView::from(long_view); // use ByteView to access the fields
134+
/// assert_eq!(view.length, 40);
135+
/// assert_eq!(view.buffer_index, 0);
136+
/// assert_eq!(view.offset, 35); // data starts after the first long string
137+
/// // Views for long strings store a 4 byte prefix
138+
/// let prefix = view.prefix.to_le_bytes();
139+
/// assert_eq!(&prefix, b"this");
140+
/// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
141+
/// assert_eq!(value, "this string is also longer than 12 bytes");
142+
/// ```
143+
///
144+
/// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
145+
///
89146
/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
90147
/// than they must point into a valid buffer. However, they can be out of order,
91148
/// non-contiguous and overlapping.
@@ -694,6 +751,8 @@ where
694751

695752
/// A [`GenericByteViewArray`] of `[u8]`
696753
///
754+
/// See [`GenericByteViewArray`] for format and layout details.
755+
///
697756
/// # Example
698757
/// ```
699758
/// use arrow_array::BinaryViewArray;
@@ -733,6 +792,8 @@ impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
733792

734793
/// A [`GenericByteViewArray`] that stores utf8 data
735794
///
795+
/// See [`GenericByteViewArray`] for format and layout details.
796+
///
736797
/// # Example
737798
/// ```
738799
/// use arrow_array::StringViewArray;

0 commit comments

Comments
 (0)