Skip to content

Commit 9f85667

Browse files
committed
feat: Add ffi_enforce_no_offset feature as a workaround hack to unavailable offset support in Arrow Java
1 parent 901fbe8 commit 9f85667

File tree

6 files changed

+70
-4
lines changed

6 files changed

+70
-4
lines changed

.github/workflows/arrow.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ jobs:
9595
run: cargo test -p arrow
9696
- name: Test arrow with all features except pyarrow
9797
run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,chrono-tz
98+
- name: Test arrow with ffi_enforce_no_offset
99+
run: cargo test -p arrow --features=force_validate,prettyprint,ipc_compression,ffi,ffi_enforce_no_offset,chrono-tz
98100
- name: Run examples
99101
run: |
100102
# Test arrow examples
@@ -129,6 +131,8 @@ jobs:
129131
run: cargo check -p arrow --no-default-features --all-targets --features test_utils
130132
- name: Check compilation --no-default-features --all-targets --features ffi
131133
run: cargo check -p arrow --no-default-features --all-targets --features ffi
134+
- name: Check compilation --no-default-features --all-targets --features ffi,ffi_enforce_no_offset
135+
run: cargo check -p arrow --no-default-features --all-targets --features ffi,ffi_enforce_no_offset
132136
- name: Check compilation --no-default-features --all-targets --features chrono-tz
133137
run: cargo check -p arrow --no-default-features --all-targets --features chrono-tz
134138

arrow-array/Cargo.toml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,20 @@ hashbrown = { version = "0.14", default-features = false }
5353
[features]
5454
ffi = ["arrow-schema/ffi", "arrow-data/ffi"]
5555

56+
# HACKY: For offset-ed offset buffer of variable-sized binary array
57+
# The valid FFI output is to record the offset buffer's offset into FFI's
58+
# `offset` field and expose the beginning of the buffer as the pointer.
59+
# So the consumer can calculate correctly the length of data buffer.
60+
# However, due to Arrow Java's issue: https://github.com/apache/arrow/issues/42156,
61+
# Arrow Java does not support `offset` field in the FFI output. It causes
62+
# no sliced binary array can be passed between arrow-rs and Arrow Java.
63+
# To workaround this issue, we have to enforce offset-ed offset buffer to
64+
# be exposed at the sliced buffer's beginning. When calculating the length
65+
# of data buffer, we assume the first element of the offset buffer is always 0
66+
# (this is how Arrow Java and arrow-rs do). This is a hacky solution and
67+
# should be removed once Arrow Java supports `offset` field in the FFI output.
68+
ffi_enforce_no_offset = ["arrow-data/ffi_enforce_no_offset"]
69+
5670
[dev-dependencies]
5771
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"] }
5872
criterion = { version = "0.5", default-features = false }

arrow-array/src/ffi.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,16 @@ impl<'a> ImportedArrowArray<'a> {
436436
let start = (unsafe { *offset_buffer.add(0) }) as usize;
437437
// get last offset
438438
let end = (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize;
439-
end - start
439+
440+
if cfg!(feature = "ffi_enforce_no_offset") {
441+
if end == start {
442+
0
443+
} else {
444+
end
445+
}
446+
} else {
447+
end - start
448+
}
440449
}
441450
(DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => {
442451
// the len of the data buffer (buffer 2) equals the difference between the last value
@@ -450,7 +459,16 @@ impl<'a> ImportedArrowArray<'a> {
450459
let start = (unsafe { *offset_buffer.add(0) }) as usize;
451460
// get last offset
452461
let end = (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize;
453-
end - start
462+
463+
if cfg!(feature = "ffi_enforce_no_offset") {
464+
if end == start {
465+
0
466+
} else {
467+
end
468+
}
469+
} else {
470+
end - start
471+
}
454472
}
455473
// buffer len of primitive types
456474
_ => {

arrow-data/Cargo.toml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,20 @@ force_validate = []
4141
# Enable ffi support
4242
ffi = ["arrow-schema/ffi"]
4343

44+
# HACKY: For offset-ed offset buffer of variable-sized binary array
45+
# The valid FFI output is to record the offset buffer's offset into FFI's
46+
# `offset` field and expose the beginning of the buffer as the pointer.
47+
# So the consumer can calculate correctly the length of data buffer.
48+
# However, due to Arrow Java's issue: https://github.com/apache/arrow/issues/42156,
49+
# Arrow Java does not support `offset` field in the FFI output. It causes
50+
# no sliced binary array can be passed between arrow-rs and Arrow Java.
51+
# To workaround this issue, we have to enforce offset-ed offset buffer to
52+
# be exposed at the sliced buffer's beginning. When calculating the length
53+
# of data buffer, we assume the first element of the offset buffer is always 0
54+
# (this is how Arrow Java and arrow-rs do). This is a hacky solution and
55+
# should be removed once Arrow Java supports `offset` field in the FFI output.
56+
ffi_enforce_no_offset = []
57+
4458
[package.metadata.docs.rs]
4559
features = ["ffi"]
4660

arrow-data/src/ffi.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,9 @@ impl FFI_ArrowArray {
152152
_ => None,
153153
};
154154

155-
let offset = if let Some(offset) = offset_offset {
155+
let offset = if cfg!(feature = "ffi_enforce_no_offset") {
156+
data.offset()
157+
} else if let Some(offset) = offset_offset {
156158
if data.offset() != 0 {
157159
// TODO: Adjust for data offset
158160
panic!("The ArrayData of a slice offset buffer should not have offset");
@@ -184,7 +186,7 @@ impl FFI_ArrowArray {
184186
| DataType::Binary
185187
| DataType::LargeBinary,
186188
1,
187-
) => {
189+
) if !cfg!(feature = "ffi_enforce_no_offset") => {
188190
// For offset buffer, take original pointer without offset.
189191
// Buffer offset should be handled by `FFI_ArrowArray` offset field.
190192
Some(b.data_ptr().as_ptr() as *const c_void)

arrow/Cargo.toml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,20 @@ force_validate = ["arrow-data/force_validate"]
8080
ffi = ["arrow-schema/ffi", "arrow-data/ffi", "arrow-array/ffi"]
8181
chrono-tz = ["arrow-array/chrono-tz"]
8282

83+
# HACKY: For offset-ed offset buffer of variable-sized binary array
84+
# The valid FFI output is to record the offset buffer's offset into FFI's
85+
# `offset` field and expose the beginning of the buffer as the pointer.
86+
# So the consumer can calculate correctly the length of data buffer.
87+
# However, due to Arrow Java's issue: https://github.com/apache/arrow/issues/42156,
88+
# Arrow Java does not support `offset` field in the FFI output. It causes
89+
# no sliced binary array can be passed between arrow-rs and Arrow Java.
90+
# To workaround this issue, we have to enforce offset-ed offset buffer to
91+
# be exposed at the sliced buffer's beginning. When calculating the length
92+
# of data buffer, we assume the first element of the offset buffer is always 0
93+
# (this is how Arrow Java and arrow-rs do). This is a hacky solution and
94+
# should be removed once Arrow Java supports `offset` field in the FFI output.
95+
ffi_enforce_no_offset = ["arrow-data/ffi_enforce_no_offset", "arrow-array/ffi_enforce_no_offset"]
96+
8397
[dev-dependencies]
8498
chrono = { workspace = true }
8599
criterion = { version = "0.5", default-features = false }

0 commit comments

Comments
 (0)