Skip to content

Commit 60c3e6c

Browse files
committed
fix: Adjust FFI_ArrowArray offset based on the offset of offset buffer
1 parent 087f34b commit 60c3e6c

File tree

3 files changed

+120
-4
lines changed

3 files changed

+120
-4
lines changed

arrow-array/src/ffi.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,6 +1225,7 @@ mod tests_from_ffi {
12251225
use std::sync::Arc;
12261226

12271227
use arrow_buffer::{bit_util, buffer::Buffer, MutableBuffer, OffsetBuffer};
1228+
use arrow_data::transform::MutableArrayData;
12281229
use arrow_data::ArrayData;
12291230
use arrow_schema::{DataType, Field};
12301231

@@ -1234,6 +1235,7 @@ mod tests_from_ffi {
12341235
Int32Array, Int64Array, StringArray, StructArray, UInt32Array, UInt64Array,
12351236
},
12361237
ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema},
1238+
make_array, ArrayRef,
12371239
};
12381240

12391241
use super::{ImportedArrowArray, Result};
@@ -1458,4 +1460,58 @@ mod tests_from_ffi {
14581460

14591461
test_round_trip(&imported_array.consume()?)
14601462
}
1463+
1464+
fn export_string(array: StringArray) -> StringArray {
1465+
let data = array.into_data();
1466+
1467+
let array = FFI_ArrowArray::new(&data);
1468+
let schema = FFI_ArrowSchema::try_from(data.data_type()).unwrap();
1469+
1470+
let array = unsafe { from_ffi(array, &schema) }.unwrap();
1471+
StringArray::from(array)
1472+
}
1473+
1474+
fn extend_array(array: &dyn Array) -> ArrayRef {
1475+
let len = array.len();
1476+
let data = array.to_data();
1477+
1478+
let mut mutable = MutableArrayData::new(vec![&data], false, len);
1479+
mutable.extend(0, 0, len);
1480+
make_array(mutable.freeze())
1481+
}
1482+
1483+
#[test]
1484+
fn test_extend_imported_string_slice() {
1485+
let mut strings = vec![];
1486+
1487+
for i in 0..1000 {
1488+
strings.push(format!("string: {}", i));
1489+
}
1490+
1491+
let string_array = StringArray::from(strings);
1492+
1493+
let imported = export_string(string_array.clone());
1494+
assert_eq!(imported.len(), 1000);
1495+
assert_eq!(imported.value(0), "string: 0");
1496+
assert_eq!(imported.value(499), "string: 499");
1497+
1498+
let copied = extend_array(&imported);
1499+
assert_eq!(
1500+
copied.as_any().downcast_ref::<StringArray>().unwrap(),
1501+
&imported
1502+
);
1503+
1504+
let slice = string_array.slice(500, 500);
1505+
1506+
let imported = export_string(slice);
1507+
assert_eq!(imported.len(), 500);
1508+
assert_eq!(imported.value(0), "string: 500");
1509+
assert_eq!(imported.value(499), "string: 999");
1510+
1511+
let copied = extend_array(&imported);
1512+
assert_eq!(
1513+
copied.as_any().downcast_ref::<StringArray>().unwrap(),
1514+
&imported
1515+
);
1516+
}
14611517
}

arrow-buffer/src/buffer/immutable.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ impl Buffer {
7171
}
7272
}
7373

74+
/// Returns the internal byte buffer.
75+
pub fn get_bytes(&self) -> Arc<Bytes> {
76+
self.data.clone()
77+
}
78+
7479
/// Create a [`Buffer`] from the provided [`Vec`] without copying
7580
#[inline]
7681
pub fn from_vec<T: ArrowNativeType>(vec: Vec<T>) -> Self {

arrow-data/src/ffi.rs

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,45 @@ impl FFI_ArrowArray {
131131
data.buffers().iter().map(|b| Some(b.clone())).collect()
132132
};
133133

134+
// Handle buffer offset for offset buffer.
135+
let offset_offset = match data.data_type() {
136+
DataType::Utf8 | DataType::Binary => {
137+
// Offset buffer is possible a slice of the buffer.
138+
// If we use slice pointer as exported buffer pointer, it will cause
139+
// the consumer to calculate incorrect length of data buffer (buffer 1).
140+
// We need to get the offset of the offset buffer and fill it in
141+
// the `FFI_ArrowArray` offset field.
142+
Some(unsafe {
143+
let b = &data.buffers()[0];
144+
b.as_ptr().offset_from(b.get_bytes().ptr().as_ptr()) as usize
145+
/ std::mem::size_of::<i32>()
146+
})
147+
}
148+
DataType::LargeUtf8 | DataType::LargeBinary => {
149+
// Offset buffer is possible a slice of the buffer.
150+
// If we use slice pointer as exported buffer pointer, it will cause
151+
// the consumer to calculate incorrect length of data buffer (buffer 1).
152+
// We need to get the offset of the offset buffer and fill it in
153+
// the `FFI_ArrowArray` offset field.
154+
Some(unsafe {
155+
let b = &data.buffers()[0];
156+
b.as_ptr().offset_from(b.get_bytes().ptr().as_ptr()) as usize
157+
/ std::mem::size_of::<i64>()
158+
})
159+
}
160+
_ => None,
161+
};
162+
163+
let offset = if let Some(offset) = offset_offset {
164+
if data.offset() != 0 {
165+
// TODO: Adjust for data offset
166+
panic!("Offset buffer should not have offset");
167+
}
168+
offset
169+
} else {
170+
data.offset()
171+
};
172+
134173
// `n_buffers` is the number of buffers by the spec.
135174
let n_buffers = {
136175
data_layout.buffers.len() + {
@@ -143,9 +182,25 @@ impl FFI_ArrowArray {
143182

144183
let buffers_ptr = buffers
145184
.iter()
146-
.flat_map(|maybe_buffer| match maybe_buffer {
147-
// note that `raw_data` takes into account the buffer's offset
148-
Some(b) => Some(b.as_ptr() as *const c_void),
185+
.enumerate()
186+
.flat_map(|(buffer_idx, maybe_buffer)| match maybe_buffer {
187+
Some(b) => {
188+
match (data.data_type(), buffer_idx) {
189+
(
190+
DataType::Utf8
191+
| DataType::LargeUtf8
192+
| DataType::Binary
193+
| DataType::LargeBinary,
194+
1,
195+
) => {
196+
// For offset buffer, take original pointer without offset.
197+
// Buffer offset should be handled by `FFI_ArrowArray` offset field.
198+
Some(b.get_bytes().ptr().as_ptr() as *const c_void)
199+
}
200+
// For other buffers, note that `raw_data` takes into account the buffer's offset
201+
_ => Some(b.as_ptr() as *const c_void),
202+
}
203+
}
149204
// This is for null buffer. We only put a null pointer for
150205
// null buffer if by spec it can contain null mask.
151206
None if data_layout.can_contain_null_mask => Some(std::ptr::null()),
@@ -186,7 +241,7 @@ impl FFI_ArrowArray {
186241
Self {
187242
length: data.len() as i64,
188243
null_count: null_count as i64,
189-
offset: data.offset() as i64,
244+
offset: offset as i64,
190245
n_buffers,
191246
n_children,
192247
buffers: private_data.buffers_ptr.as_mut_ptr(),

0 commit comments

Comments
 (0)