Skip to content

Commit cd39b8c

Browse files
authored
Compute data buffer length by using start and end values in offset buffer (#5741)
* Compute data buffer length by offset buffer start and end values
* Update code comment
* Add unit test
* Add round_trip check
* Fix clippy
1 parent 1c86921 commit cd39b8c

File tree

2 files changed

+55
-16
lines changed

2 files changed

+55
-16
lines changed

arrow-array/src/ffi.rs

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -425,24 +425,32 @@ impl<'a> ImportedArrowArray<'a> {
425425
(length + 1) * (bits / 8)
426426
}
427427
(DataType::Utf8, 2) | (DataType::Binary, 2) => {
428-
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
428+
// the len of the data buffer (buffer 2) equals the difference between the last value
429+
// and the first value of the offset buffer (buffer 1).
429430
let len = self.buffer_len(1, dt)?;
430431
// first buffer is the null buffer => add(1)
431432
// we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets.
432433
#[allow(clippy::cast_ptr_alignment)]
433434
let offset_buffer = self.array.buffer(1) as *const i32;
435+
// get first offset
436+
let start = (unsafe { *offset_buffer.add(0) }) as usize;
434437
// get last offset
435-
(unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize
438+
let end = (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize;
439+
end - start
436440
}
437441
(DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => {
438-
// the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1)
442+
// the len of the data buffer (buffer 2) equals the difference between the last value
443+
// and the first value of the offset buffer (buffer 1).
439444
let len = self.buffer_len(1, dt)?;
440445
// first buffer is the null buffer => add(1)
441446
// we assume that pointer is aligned for `i64`, as Large uses `i64` offsets.
442447
#[allow(clippy::cast_ptr_alignment)]
443448
let offset_buffer = self.array.buffer(1) as *const i64;
449+
// get first offset
450+
let start = (unsafe { *offset_buffer.add(0) }) as usize;
444451
// get last offset
445-
(unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize
452+
let end = (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize;
453+
end - start
446454
}
447455
// buffer len of primitive types
448456
_ => {
@@ -1216,7 +1224,7 @@ mod tests_to_then_from_ffi {
12161224
mod tests_from_ffi {
12171225
use std::sync::Arc;
12181226

1219-
use arrow_buffer::{bit_util, buffer::Buffer};
1227+
use arrow_buffer::{bit_util, buffer::Buffer, MutableBuffer, OffsetBuffer};
12201228
use arrow_data::ArrayData;
12211229
use arrow_schema::{DataType, Field};
12221230

@@ -1228,7 +1236,7 @@ mod tests_from_ffi {
12281236
ffi::{from_ffi, FFI_ArrowArray, FFI_ArrowSchema},
12291237
};
12301238

1231-
use super::Result;
1239+
use super::{ImportedArrowArray, Result};
12321240

12331241
fn test_round_trip(expected: &ArrayData) -> Result<()> {
12341242
// here we export the array
@@ -1420,4 +1428,34 @@ mod tests_from_ffi {
14201428
let data = array.into_data();
14211429
test_round_trip(&data)
14221430
}
1431+
1432+
/// Regression test: importing a string array whose single offset is non-zero
/// must report an empty data buffer rather than using the offset value as the
/// data length.
///
/// The test builds the array by hand, wraps it in an `ImportedArrowArray`,
/// checks the lengths reported for the offset buffer (index 1) and the data
/// buffer (index 2), and finally round-trips the consumed data.
#[test]
1433+
fn test_empty_string_with_non_zero_offset() -> Result<()> {
1434+
// Simulate an empty string array with a non-zero offset from a producer
1435+
// Zero-capacity buffer: there are no value bytes at all.
let data: Buffer = MutableBuffer::new(0).into();
1436+
// A single offset entry (123): first and last offset are the same element.
let offsets = OffsetBuffer::new(vec![123].into());
1437+
// NOTE(review): the unchecked constructor is presumably needed because the
// offset (123) points past the end of the empty data buffer — confirm
// against `StringArray` validation rules.
let string_array =
1438+
unsafe { StringArray::new_unchecked(offsets.clone(), data.clone(), None) };
1439+
1440+
// Shadows the raw `data` buffer with the array's `ArrayData`.
let data = string_array.into_data();
1441+
1442+
// Build the FFI array and schema structures from the array data.
let array = FFI_ArrowArray::new(&data);
1443+
let schema = FFI_ArrowSchema::try_from(data.data_type())?;
1444+
1445+
// Recover the data type from the FFI schema, as an importer would.
let dt = DataType::try_from(&schema)?;
1446+
let array = Arc::new(array);
1447+
// Constructed directly (rather than via `from_ffi`) so that `buffer_len`
// can be called on it below.
let imported_array = ImportedArrowArray {
1448+
array: &array,
1449+
data_type: dt,
1450+
owner: &array,
1451+
};
1452+
1453+
// Buffer 1 is the offset buffer, buffer 2 is the data buffer.
let offset_buf_len = imported_array.buffer_len(1, &imported_array.data_type)?;
1454+
let data_buf_len = imported_array.buffer_len(2, &imported_array.data_type)?;
1455+
1456+
// Expected: 4 bytes for the offset buffer (presumably one `i32` offset).
assert_eq!(offset_buf_len, 4);
1457+
// Expected: last offset minus first offset = 123 - 123 = 0 data bytes.
assert_eq!(data_buf_len, 0);
1458+
1459+
// The imported data must also survive an export/import round trip.
test_round_trip(&imported_array.consume()?)
1460+
}
14231461
}

arrow-data/src/equal/variable_size.rs

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,18 @@ fn offset_value_equal<T: ArrowNativeType + Integer>(
3232
) -> bool {
3333
let lhs_start = lhs_offsets[lhs_pos].as_usize();
3434
let rhs_start = rhs_offsets[rhs_pos].as_usize();
35-
let lhs_len = lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos];
36-
let rhs_len = rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos];
35+
let lhs_len = (lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos])
36+
.to_usize()
37+
.unwrap();
38+
let rhs_len = (rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos])
39+
.to_usize()
40+
.unwrap();
3741

38-
lhs_len == rhs_len
39-
&& equal_len(
40-
lhs_values,
41-
rhs_values,
42-
lhs_start,
43-
rhs_start,
44-
lhs_len.to_usize().unwrap(),
45-
)
42+
if lhs_len == 0 && rhs_len == 0 {
43+
return true;
44+
}
45+
46+
lhs_len == rhs_len && equal_len(lhs_values, rhs_values, lhs_start, rhs_start, lhs_len)
4647
}
4748

4849
pub(super) fn variable_sized_equal<T: ArrowNativeType + Integer>(

0 commit comments

Comments
 (0)