Skip to content

Commit 0a4d8a1

Browse files
authored
Fix FFI array offset handling (#5964)
1 parent 6bc9514 commit 0a4d8a1

File tree

2 files changed

+16
-63
lines changed

2 files changed

+16
-63
lines changed

arrow-array/src/ffi.rs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -425,32 +425,32 @@ impl<'a> ImportedArrowArray<'a> {
425425
(length + 1) * (bits / 8)
426426
}
427427
(DataType::Utf8, 2) | (DataType::Binary, 2) => {
428-
// the len of the data buffer (buffer 2) equals the difference between the last value
429-
// and the first value of the offset buffer (buffer 1).
428+
if self.array.is_empty() {
429+
return Ok(0);
430+
}
431+
432+
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
430433
let len = self.buffer_len(1, dt)?;
431434
// first buffer is the null buffer => add(1)
432435
// we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets.
433436
#[allow(clippy::cast_ptr_alignment)]
434437
let offset_buffer = self.array.buffer(1) as *const i32;
435-
// get first offset
436-
let start = (unsafe { *offset_buffer.add(0) }) as usize;
437438
// get last offset
438-
let end = (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize;
439-
end - start
439+
(unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize
440440
}
441441
(DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => {
442-
// the len of the data buffer (buffer 2) equals the difference between the last value
443-
// and the first value of the offset buffer (buffer 1).
442+
if self.array.is_empty() {
443+
return Ok(0);
444+
}
445+
446+
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
444447
let len = self.buffer_len(1, dt)?;
445448
// first buffer is the null buffer => add(1)
446449
// we assume that pointer is aligned for `i64`, as Large uses `i64` offsets.
447450
#[allow(clippy::cast_ptr_alignment)]
448451
let offset_buffer = self.array.buffer(1) as *const i64;
449-
// get first offset
450-
let start = (unsafe { *offset_buffer.add(0) }) as usize;
451452
// get last offset
452-
let end = (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize;
453-
end - start
453+
(unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize
454454
}
455455
// buffer len of primitive types
456456
_ => {

arrow-data/src/ffi.rs

Lines changed: 4 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -131,37 +131,6 @@ impl FFI_ArrowArray {
131131
data.buffers().iter().map(|b| Some(b.clone())).collect()
132132
};
133133

134-
// Handle buffer offset for offset buffer.
135-
let offset_offset = match data.data_type() {
136-
DataType::Utf8 | DataType::Binary => {
137-
// The offset buffer is possibly a slice of a larger buffer.
138-
// If we use slice pointer as exported buffer pointer, it will cause
139-
// the consumer to calculate incorrect length of data buffer (buffer 1).
140-
// We need to get the offset of the offset buffer and fill it in
141-
// the `FFI_ArrowArray` offset field.
142-
Some(data.buffers()[0].ptr_offset() / std::mem::size_of::<i32>())
143-
}
144-
DataType::LargeUtf8 | DataType::LargeBinary => {
145-
// The offset buffer is possibly a slice of a larger buffer.
146-
// If we use slice pointer as exported buffer pointer, it will cause
147-
// the consumer to calculate incorrect length of data buffer (buffer 1).
148-
// We need to get the offset of the offset buffer and fill it in
149-
// the `FFI_ArrowArray` offset field.
150-
Some(data.buffers()[0].ptr_offset() / std::mem::size_of::<i64>())
151-
}
152-
_ => None,
153-
};
154-
155-
let offset = if let Some(offset) = offset_offset {
156-
if data.offset() != 0 {
157-
// TODO: Adjust for data offset
158-
panic!("The ArrayData of a slice offset buffer should not have offset");
159-
}
160-
offset
161-
} else {
162-
data.offset()
163-
};
164-
165134
// `n_buffers` is the number of buffers by the spec.
166135
let n_buffers = {
167136
data_layout.buffers.len() + {
@@ -174,25 +143,9 @@ impl FFI_ArrowArray {
174143

175144
let buffers_ptr = buffers
176145
.iter()
177-
.enumerate()
178-
.flat_map(|(buffer_idx, maybe_buffer)| match maybe_buffer {
179-
Some(b) => {
180-
match (data.data_type(), buffer_idx) {
181-
(
182-
DataType::Utf8
183-
| DataType::LargeUtf8
184-
| DataType::Binary
185-
| DataType::LargeBinary,
186-
1,
187-
) => {
188-
// For offset buffer, take original pointer without offset.
189-
// Buffer offset should be handled by `FFI_ArrowArray` offset field.
190-
Some(b.data_ptr().as_ptr() as *const c_void)
191-
}
192-
// For other buffers, note that `raw_data` takes into account the buffer's offset
193-
_ => Some(b.as_ptr() as *const c_void),
194-
}
195-
}
146+
.flat_map(|maybe_buffer| match maybe_buffer {
147+
// note that `as_ptr` takes into account the buffer's offset
148+
Some(b) => Some(b.as_ptr() as *const c_void),
196149
// This is for null buffer. We only put a null pointer for
197150
// null buffer if by spec it can contain null mask.
198151
None if data_layout.can_contain_null_mask => Some(std::ptr::null()),
@@ -233,7 +186,7 @@ impl FFI_ArrowArray {
233186
Self {
234187
length: data.len() as i64,
235188
null_count: null_count as i64,
236-
offset: offset as i64,
189+
offset: data.offset() as i64,
237190
n_buffers,
238191
n_children,
239192
buffers: private_data.buffers_ptr.as_mut_ptr(),

0 commit comments

Comments
 (0)