@@ -33,14 +33,18 @@ use std::sync::Arc;
33
33
34
34
use super :: ByteArrayType ;
35
35
36
- /// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
36
+ /// [Variable-size Binary View Layout]: An array of variable length bytes views.
37
+ ///
38
+ /// This array type is used to store variable length byte data (e.g. Strings, Binary)
39
+ /// and has efficient operations such as `take`, `filter`, and comparison.
37
40
///
38
41
/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
39
42
///
40
- /// This is different from [`GenericByteArray`] as it stores both an offset and
41
- /// length meaning that take / filter operations can be implemented without
42
- /// copying the underlying data. In addition, it stores an inlined prefix which
43
- /// can be used to speed up comparisons.
43
+ /// This is different from [`GenericByteArray`], which also stores variable
44
+ /// length byte data, as it represents strings with an offset and length. `take`
45
+ /// and `filter` like operations are implemented by manipulating the "views"
46
+ /// (`u128`) without modifying the bytes. Each view also stores an inlined
47
+ /// prefix which speeds up comparisons.
44
48
///
45
49
/// # See Also
46
50
///
@@ -50,11 +54,18 @@ use super::ByteArrayType;
50
54
///
51
55
/// [`ByteView`]: arrow_data::ByteView
52
56
///
53
- /// # Notes
57
+ /// # Use the [`eq`] kernel to compare the logical content.
58
+ ///
59
+ /// Comparing two `GenericByteViewArray` using PartialEq compares by structure
60
+ /// (the `u128`s) and contents of the buffers, not by logical content. As there
61
+ /// are many different buffer layouts to represent the same data (e.g. different
62
+ /// offsets, different buffer sizes, etc) two arrays with the same data may not
63
+ /// compare equal.
64
+ ///
65
+ /// To compare the logical content of two `GenericByteViewArray`s, use the [`eq`]
66
+ /// kernel.
54
67
///
55
- /// Comparing two `GenericByteViewArray` using PartialEq compares by structure,
56
- /// not by value. as there are many different buffer layouts to represent the
57
- /// same data (e.g. different offsets, different buffer sizes, etc).
68
+ /// [`eq`]: https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.eq.html
58
69
///
59
70
/// # Layout: "views" and buffers
60
71
///
@@ -86,6 +97,52 @@ use super::ByteArrayType;
86
97
/// view and the entire string is stored in one of the buffers. See [`ByteView`]
87
98
/// to access the fields of these views.
88
99
///
100
+ /// As with other arrays, the optimized kernels in [`arrow_compute`] are likely
101
+ /// the easiest and fastest way to work with this data. However, it is possible
102
+ /// to access the views and buffers directly for more control.
103
+ ///
104
+ /// For example
105
+ ///
106
+ /// ```rust
107
+ /// # use arrow_array::StringViewArray;
108
+ /// # use arrow_array::Array;
109
+ /// use arrow_data::ByteView;
110
+ /// let array = StringViewArray::from(vec![
111
+ /// "hello",
112
+ /// "this string is longer than 12 bytes",
113
+ /// "this string is also longer than 12 bytes"
114
+ /// ]);
115
+ ///
116
+ /// // ** Examine the first view (short string) **
117
+ /// assert!(array.is_valid(0)); // Check for nulls
118
+ /// let short_view: u128 = array.views()[0]; // "hello"
119
+ /// // get length of the string
120
+ /// let len = short_view as u32;
121
+ /// assert_eq!(len, 5); // strings of 12 bytes or fewer are stored inline in the view
122
+ /// // SAFETY: `short_view` is a valid view whose 5 byte string is stored inline
123
+ /// let value = unsafe {
124
+ /// StringViewArray::inline_value(&short_view, len as usize)
125
+ /// };
126
+ /// assert_eq!(value, b"hello");
127
+ ///
128
+ /// // ** Examine the third view (long string) **
129
+ /// assert!(array.is_valid(2)); // Check for nulls
130
+ /// let long_view: u128 = array.views()[2]; // "this string is also longer than 12 bytes"
131
+ /// let len = long_view as u32;
132
+ /// assert_eq!(len, 40); // strings longer than 12 bytes are stored in the buffer
133
+ /// let view = ByteView::from(long_view); // use ByteView to access the fields
134
+ /// assert_eq!(view.length, 40);
135
+ /// assert_eq!(view.buffer_index, 0);
136
+ /// assert_eq!(view.offset, 35); // data starts after the first long string
137
+ /// // Views for long strings store a 4 byte prefix
138
+ /// let prefix = view.prefix.to_le_bytes();
139
+ /// assert_eq!(&prefix, b"this");
140
+ /// let value = array.value(2); // get the string value (see `value` implementation for how to access the bytes directly)
141
+ /// assert_eq!(value, "this string is also longer than 12 bytes");
142
+ /// ```
143
+ ///
144
+ /// [`arrow_compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
145
+ ///
89
146
/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
90
147
/// than they must point into a valid buffer. However, they can be out of order,
91
148
/// non continuous and overlapping.
@@ -694,6 +751,8 @@ where
694
751
695
752
/// A [`GenericByteViewArray`] of `[u8]`
696
753
///
754
+ /// See [`GenericByteViewArray`] for format and layout details.
755
+ ///
697
756
/// # Example
698
757
/// ```
699
758
/// use arrow_array::BinaryViewArray;
@@ -733,6 +792,8 @@ impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
733
792
734
793
/// A [`GenericByteViewArray`] that stores utf8 data
735
794
///
795
+ /// See [`GenericByteViewArray`] for format and layout details.
796
+ ///
736
797
/// # Example
737
798
/// ```
738
799
/// use arrow_array::StringViewArray;
0 commit comments