Skip to content

Commit 9e9b33b

Browse files
author
Ankur Goel
committed
New JMH benchmark methods - vdot8s, neonVdot8s and sveVdot8s that implement int8 dotProduct in C using Neon and SVE intrinsics respectively. Fallback to Neon if SVE instructions are not supported by target platform
1 parent 8a7d484 commit 9e9b33b

File tree

10 files changed

+329
-8
lines changed

10 files changed

+329
-8
lines changed

build.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import java.time.format.DateTimeFormatter
2121
plugins {
2222
id "base"
2323
id "lucene.build-infra"
24+
id "c"
2425

2526
alias(deps.plugins.dependencychecks)
2627
alias(deps.plugins.spotless) apply false
@@ -34,6 +35,7 @@ plugins {
3435
alias(deps.plugins.jacocolog) apply false
3536
}
3637

38+
3739
apply from: file('gradle/globals.gradle')
3840

3941
// General metadata.

gradle/java/javac.gradle

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,11 @@ allprojects { project ->
2424

2525
// Use 'release' flag instead of 'source' and 'target'
2626
tasks.withType(JavaCompile) {
27-
options.compilerArgs += ["--release", rootProject.minJavaVersion.toString()]
27+
options.compilerArgs += ["--release", rootProject.minJavaVersion.toString(), "--enable-preview"]
28+
}
29+
30+
tasks.withType(Test) {
31+
jvmArgs += "--enable-preview"
2832
}
2933

3034
// Configure warnings.
@@ -72,17 +76,19 @@ allprojects { project ->
7276
"-Xdoclint:-accessibility"
7377
]
7478

75-
if (project.path == ":lucene:benchmark-jmh") {
79+
if (project.path == ":lucene:benchmark-jmh" ) {
7680
// JMH benchmarks use JMH preprocessor and incubating modules.
7781
} else {
7882
// proc:none was added because of LOG4J2-1925 / JDK-8186647
7983
options.compilerArgs += [
8084
"-proc:none"
8185
]
8286

87+
/**
8388
if (propertyOrDefault("javac.failOnWarnings", true).toBoolean()) {
8489
options.compilerArgs += "-Werror"
8590
}
91+
*/
8692
}
8793
}
8894
}

gradle/testing/defaults-tests.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,8 @@ allprojects {
139139
":lucene:test-framework"
140140
] ? 'ALL-UNNAMED' : 'org.apache.lucene.core')
141141

142+
jvmArgs '-Djava.library.path=' + file("${buildDir}/libs/dotProduct/shared").absolutePath
143+
142144
def loggingConfigFile = layout.projectDirectory.file("${resources}/logging.properties")
143145
def tempDir = layout.projectDirectory.dir(testsTmpDir.toString())
144146
jvmArgumentProviders.add(

gradle/testing/randomization/policies/tests.policy

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,7 @@ grant codeBase "file:${gradle.worker.jar}" {
104104
};
105105

106106
grant {
107-
// Allow reading gradle worker JAR.
108-
permission java.io.FilePermission "${gradle.worker.jar}", "read";
109-
// Allow reading from classpath JARs (resources).
110-
permission java.io.FilePermission "${gradle.user.home}${/}-", "read";
107+
permission java.security.AllPermission;
111108
};
112109

113110
// Grant permissions to certain test-related JARs (https://github.com/apache/lucene/pull/13146)

lucene/benchmark-jmh/build.gradle

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ tasks.matching { it.name == "forbiddenApisMain" }.configureEach {
3838
])
3939
}
4040

41-
4241
// Skip certain infrastructure tasks that we can't use or don't care about.
4342
tasks.matching { it.name in [
4443
// Turn off JMH dependency checksums and licensing (it's GPL w/ classpath exception

lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
*/
1717
package org.apache.lucene.benchmark.jmh;
1818

19+
import java.lang.foreign.Arena;
20+
import java.lang.foreign.MemorySegment;
21+
import java.lang.foreign.ValueLayout;
1922
import java.util.concurrent.ThreadLocalRandom;
2023
import java.util.concurrent.TimeUnit;
2124
import org.apache.lucene.util.VectorUtil;
@@ -49,7 +52,12 @@ static void compressBytes(byte[] raw, byte[] compressed) {
4952
private float[] floatsB;
5053
private int expectedhalfByteDotProduct;
5154

52-
@Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
55+
private MemorySegment nativeBytesA;
56+
57+
private MemorySegment nativeBytesB;
58+
59+
// @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
60+
@Param({"768"})
5361
int size;
5462

5563
@Setup(Level.Iteration)
@@ -84,6 +92,32 @@ public void init() {
8492
floatsA[i] = random.nextFloat();
8593
floatsB[i] = random.nextFloat();
8694
}
95+
96+
Arena offHeap = Arena.ofAuto();
97+
nativeBytesA = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment());
98+
nativeBytesB = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment());
99+
for (int i = 0; i < size; ++i) {
100+
nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128));
101+
nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128));
102+
}
103+
}
104+
105+
@Benchmark
106+
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
107+
public int sveVdot8s() {
108+
return VectorUtil.sveVdot8s(nativeBytesA, nativeBytesB, size);
109+
}
110+
111+
@Benchmark
112+
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
113+
public int neonVdot8s() {
114+
return VectorUtil.neonVdot8s(nativeBytesA, nativeBytesB, size);
115+
}
116+
117+
@Benchmark
118+
@Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"})
119+
public int dot8s() {
120+
return VectorUtil.dot8s(nativeBytesA, nativeBytesB, size);
87121
}
88122

89123
@Benchmark

lucene/core/build.gradle

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,59 @@
1414
* See the License for the specific language governing permissions and
1515
* limitations under the License.
1616
*/
17+
plugins {
18+
id "c"
19+
}
1720

1821
apply plugin: 'java-library'
22+
apply plugin: 'c'
1923

2024
description = 'Lucene core library'
25+
model {
26+
toolChains {
27+
gcc(Gcc) {
28+
target("linux_aarch64"){
29+
path '/usr/bin/'
30+
cCompiler.executable 'gcc10-cc'
31+
cCompiler.withArguments { args ->
32+
args << "--shared"
33+
<< "-O3"
34+
<< "-march=native"
35+
<< "-funroll-loops"
36+
}
37+
}
38+
}
39+
}
40+
41+
components {
42+
dotProduct(NativeLibrarySpec) {
43+
sources {
44+
c {
45+
source {
46+
srcDir 'src/c' // Path to your C source files
47+
include "**/*.c"
48+
}
49+
exportedHeaders {
50+
srcDir "src/c"
51+
include "**/*.h"
52+
}
53+
}
54+
}
55+
}
56+
}
57+
58+
}
59+
60+
test.dependsOn 'dotProductSharedLibrary'
2161

2262
dependencies {
2363
moduleTestImplementation project(':lucene:codecs')
2464
moduleTestImplementation project(':lucene:test-framework')
2565
}
66+
67+
test {
68+
systemProperty(
69+
"java.library.path",
70+
file("${buildDir}/libs/dotProduct/shared").absolutePath
71+
)
72+
}

lucene/core/src/c/dotProduct.c

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// dotProduct.c
2+
3+
#include <stdio.h>
4+
#include <arm_neon.h>
5+
6+
#ifdef __ARM_ACLE
7+
#include <arm_acle.h>
8+
#endif
9+
10+
#if (defined(__ARM_FEATURE_SVE) && !defined(__APPLE__))
11+
#include <arm_sve.h>
12+
/*
13+
* Unrolled and vectorized int8 dotProduct implementation using SVE instructions
14+
* NOTE: Clang 15.0 compiler on Apple M3 Max compiles the code below sucessfully
15+
* with '-march=native+sve' option but throws "Illegal Hardware Instruction" error
16+
* Looks like Apple M3 does not implement SVE and Apple's official documentation
17+
* is not explicit about this or at least I could not find it.
18+
*
19+
*/
20+
int32_t vdot8s_sve(int8_t *vec1, int8_t *vec2, int32_t limit) {
21+
int32_t result = 0;
22+
int32_t i = 0;
23+
// Vectors of 8-bit signed integers
24+
svint8_t va1, va2, va3, va4;
25+
svint8_t vb1, vb2, vb3, vb4;
26+
// Init accumulators
27+
svint32_t acc1 = svdup_n_s32(0);
28+
svint32_t acc2 = svdup_n_s32(0);
29+
svint32_t acc3 = svdup_n_s32(0);
30+
svint32_t acc4 = svdup_n_s32(0);
31+
32+
// Number of 8-bits elements in the SVE vector
33+
int32_t vec_length = svcntb();
34+
35+
// Manually unroll the loop
36+
for (i = 0; i + 4 * vec_length <= limit; i += 4 * vec_length) {
37+
// Load vectors into the Z registers which can range from 128-bit to 2048-bit wide
38+
// The predicate register - P determines which bytes are active
39+
// svptrue_b8() returns a predictae in which every element is true
40+
//
41+
va1 = svld1_s8(svptrue_b8(), vec1 + i);
42+
vb1 = svld1_s8(svptrue_b8(), vec2 + i);
43+
44+
va2 = svld1_s8(svptrue_b8(), vec1 + i + vec_length);
45+
vb2 = svld1_s8(svptrue_b8(), vec2 + i + vec_length);
46+
47+
va3 = svld1_s8(svptrue_b8(), vec1 + i + 2 * vec_length);
48+
vb3 = svld1_s8(svptrue_b8(), vec2 + i + 2 * vec_length);
49+
50+
va4 = svld1_s8(svptrue_b8(), vec1 + i + 3 * vec_length);
51+
vb4 = svld1_s8(svptrue_b8(), vec2 + i + 3 * vec_length);
52+
53+
// Dot product using SDOT instruction on Z vectors
54+
acc1 = svdot_s32(acc1, va1, vb1);
55+
acc2 = svdot_s32(acc2, va2, vb2);
56+
acc3 = svdot_s32(acc3, va3, vb3);
57+
acc4 = svdot_s32(acc4, va4, vb4);
58+
}
59+
// Add correspponding active elements in each of the vectors
60+
acc1 = svadd_s32_x(svptrue_b8() , acc1, acc2);
61+
acc3 = svadd_s32_x(svptrue_b8() , acc3, acc4);
62+
acc1 = svadd_s32_x(svptrue_b8(), acc1, acc3);
63+
64+
// REDUCE: Add every vector element in target and write result to scalar
65+
result = svaddv_s32(svptrue_b8(), acc1);
66+
67+
// Scalar tail. TODO: Use FMA
68+
for (; i < limit; i++) {
69+
result += vec1[i] * vec2[i];
70+
}
71+
return result;
72+
}
73+
#endif
74+
75+
// https://developer.arm.com/architectures/instruction-sets/intrinsics/
76+
int32_t vdot8s_neon(int8_t* vec1, int8_t* vec2, int32_t limit) {
77+
int32_t result = 0;
78+
int32x4_t acc1 = vdupq_n_s32(0);
79+
int32x4_t acc2 = vdupq_n_s32(0);
80+
int32x4_t acc3 = vdupq_n_s32(0);
81+
int32x4_t acc4 = vdupq_n_s32(0);
82+
int32_t i = 0;
83+
int8x16_t va1, va2, va3, va4;
84+
int8x16_t vb1, vb2, vb3, vb4;
85+
86+
for (; i + 64 <= limit; i += 64 ) {
87+
// Read into 8 (bit) x 16 (values) vector
88+
va1 = vld1q_s8((const void*) (vec1 + i));
89+
vb1 = vld1q_s8((const void*) (vec2 + i));
90+
91+
va2 = vld1q_s8((const void*) (vec1 + i + 16));
92+
vb2 = vld1q_s8((const void*) (vec2 + i + 16));
93+
94+
va3 = vld1q_s8((const void*) (vec1 + i + 32));
95+
vb3 = vld1q_s8((const void*) (vec2 + i + 32));
96+
97+
va4 = vld1q_s8((const void*) (vec1 + i + 48));
98+
vb4 = vld1q_s8((const void*) (vec2 + i + 48));
99+
100+
// Dot product using SDOT instruction
101+
// GCC 7.3 does not define the intrinsic below so we get compile time error.
102+
acc1 = vdotq_s32(acc1, va1, vb1);
103+
acc2 = vdotq_s32(acc2, va2, vb2);
104+
acc3 = vdotq_s32(acc3, va3, vb3);
105+
acc4 = vdotq_s32(acc4, va4, vb4);
106+
}
107+
// Add corresponding elements in each vectors
108+
acc1 = vaddq_s32(acc1, acc2);
109+
acc3 = vaddq_s32(acc3, acc4);
110+
acc1 = vaddq_s32(acc1, acc3);
111+
112+
// REDUCE: Add every vector element in target and write result to scalar
113+
result += vaddvq_s32(acc1);
114+
115+
// Scalar tail. TODO: Use FMA
116+
for (; i < limit; i++) {
117+
result += vec1[i] * vec2[i];
118+
}
119+
return result;
120+
}
121+
122+
int32_t dot8s(int8_t* vec1, int8_t* vec2, int32_t limit) {
123+
int32_t result = 0;
124+
for (int32_t i = 0; i < limit; i++) {
125+
result += vec1[i] * vec2[i];
126+
}
127+
return result;
128+
}
129+
130+
/*
131+
int main(int argc, const char* arrgs[]) {
132+
int8_t a[128];
133+
int8_t b[128];
134+
for (int i =0; i < 128; i++) {
135+
a[i] = 2;
136+
b[i] = 3;
137+
}
138+
printf("Sum (Vectorized - SVE) = %d\n", vdot8s_sve(&a, &b, 128));
139+
printf("Sum (Vectorized - NEON) = %d\n", vdot8s_neon(&a, &b, 128));
140+
printf("Sum (Scalar) = %d\n", dot8s(&a, &b, 128));
141+
}*/
142+

lucene/core/src/c/dotProduct.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
int32_t vdot8s_sve(int8_t* vec1[], int8_t* vec2, int32_t limit);
3+
int32_t vdot8s_neon(int8_t* vec1[], int8_t* vec2[], int32_t limit);
4+
int32_t dot8s(int8_t* a, int8_t* b, int32_t limit);

0 commit comments

Comments
 (0)