Skip to content

Commit fb697ce

Browse files
authored
Remove simd and avx512 bitwise kernels in favor of autovectorization (#1830)
* Remove simd and avx512 bitwise kernels since they are actually slightly slower than the autovectorized version * Add notes about target-cpu to README
1 parent 029203e commit fb697ce

File tree

7 files changed

+69
-413
lines changed

7 files changed

+69
-413
lines changed

arrow/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ bitflags = "1.2.1"
6161

6262
[features]
6363
default = ["csv", "ipc", "test_utils"]
64-
avx512 = []
6564
csv = ["csv_crate"]
6665
ipc = ["flatbuffers"]
6766
simd = ["packed_simd"]

arrow/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,17 @@ cargo run --example read_csv
100100
```
101101

102102
[arrow]: https://arrow.apache.org/
103+
104+
105+
## Performance
106+
107+
Most of the compute kernels benefit a lot from being optimized for a specific CPU target.
108+
This is especially so on x86-64, since without specifying a target the compiler can only assume support for SSE2 vector instructions.
109+
Passing one of the following values as `-Ctarget-cpu=value` in `RUSTFLAGS` can therefore improve performance significantly:
110+
111+
- `native`: Target the exact features of the CPU that the build is running on.
112+
This should give the best performance when building and running locally, but should be used carefully, for example when building in a CI pipeline or when shipping pre-compiled software, since the resulting binary may use instructions that the machine running it does not support.
113+
- `x86-64-v3`: Includes AVX2 support. It is close to the Intel `haswell` architecture released in 2013 and should be supported by any recent Intel or AMD CPU.
114+
- `x86-64-v4`: Includes AVX512 support, which is available on Intel `skylake` server and `icelake`/`tigerlake`/`rocketlake` laptop and desktop processors.
115+
116+
These flags should be used in addition to the `simd` feature, since they will also affect the code generated by the simd library.

arrow/benches/buffer_bit_ops.rs

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,14 @@
1717

1818
#[macro_use]
1919
extern crate criterion;
20-
use criterion::Criterion;
20+
21+
use criterion::{Criterion, Throughput};
2122

2223
extern crate arrow;
2324

24-
use arrow::buffer::{Buffer, MutableBuffer};
25+
use arrow::buffer::{
26+
buffer_bin_and, buffer_bin_or, buffer_unary_not, Buffer, MutableBuffer,
27+
};
2528

2629
/// Helper function to create arrays
2730
fn create_buffer(size: usize) -> Buffer {
@@ -42,17 +45,59 @@ fn bench_buffer_or(left: &Buffer, right: &Buffer) {
4245
criterion::black_box((left | right).unwrap());
4346
}
4447

48+
fn bench_buffer_not(buffer: &Buffer) {
49+
criterion::black_box(!buffer);
50+
}
51+
52+
fn bench_buffer_and_with_offsets(
53+
left: &Buffer,
54+
left_offset: usize,
55+
right: &Buffer,
56+
right_offset: usize,
57+
len: usize,
58+
) {
59+
criterion::black_box(buffer_bin_and(left, left_offset, right, right_offset, len));
60+
}
61+
62+
fn bench_buffer_or_with_offsets(
63+
left: &Buffer,
64+
left_offset: usize,
65+
right: &Buffer,
66+
right_offset: usize,
67+
len: usize,
68+
) {
69+
criterion::black_box(buffer_bin_or(left, left_offset, right, right_offset, len));
70+
}
71+
72+
fn bench_buffer_not_with_offsets(buffer: &Buffer, offset: usize, len: usize) {
73+
criterion::black_box(buffer_unary_not(buffer, offset, len));
74+
}
75+
4576
fn bit_ops_benchmark(c: &mut Criterion) {
4677
let left = create_buffer(512 * 10);
4778
let right = create_buffer(512 * 10);
4879

49-
c.bench_function("buffer_bit_ops and", |b| {
50-
b.iter(|| bench_buffer_and(&left, &right))
51-
});
80+
c.benchmark_group("buffer_binary_ops")
81+
.throughput(Throughput::Bytes(3 * left.len() as u64))
82+
.bench_function("and", |b| b.iter(|| bench_buffer_and(&left, &right)))
83+
.bench_function("or", |b| b.iter(|| bench_buffer_or(&left, &right)))
84+
.bench_function("and_with_offset", |b| {
85+
b.iter(|| {
86+
bench_buffer_and_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5)
87+
})
88+
})
89+
.bench_function("or_with_offset", |b| {
90+
b.iter(|| {
91+
bench_buffer_or_with_offsets(&left, 1, &right, 2, left.len() * 8 - 5)
92+
})
93+
});
5294

53-
c.bench_function("buffer_bit_ops or", |b| {
54-
b.iter(|| bench_buffer_or(&left, &right))
55-
});
95+
c.benchmark_group("buffer_unary_ops")
96+
.throughput(Throughput::Bytes(2 * left.len() as u64))
97+
.bench_function("not", |b| b.iter(|| bench_buffer_not(&left)))
98+
.bench_function("not_with_offset", |b| {
99+
b.iter(|| bench_buffer_not_with_offsets(&left, 1, left.len() * 8 - 5))
100+
});
56101
}
57102

58103
criterion_group!(benches, bit_ops_benchmark);

arrow/src/arch/avx512.rs

Lines changed: 0 additions & 73 deletions
This file was deleted.

arrow/src/arch/mod.rs

Lines changed: 0 additions & 22 deletions
This file was deleted.

0 commit comments

Comments
 (0)