Skip to content

Commit 03bc702

Browse files
committed
[v1.16] Allow for separate compilation of SSE, AVX and AVX512 files (fixes illegal instruction errors)
1 parent b9a5220 commit 03bc702

11 files changed

+353
-184
lines changed

.gitignore

+1
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,3 @@
11
*.o
22
peakperf
3+
build/

CMakeLists.txt

+6-2
Original file line number | Diff line number | Diff line change
@@ -31,8 +31,12 @@ set(CMAKE_CXX_FLAGS "${SANITY_FLAGS} -O2")
3131
if(ENABLE_CPU_DEVICE)
3232
target_compile_definitions(peakperf PUBLIC DEVICE_CPU_ENABLED)
3333

34-
set_source_files_properties(${ARCH_CPU_DIR}/arch.cpp PROPERTIES COMPILE_FLAGS "-mavx -mavx512f -fopenmp")
35-
add_library(cpu_device STATIC ${CPUFETCH_DIR}/cpufetch.cpp ${CPUFETCH_DIR}/cpuid.cpp ${CPUFETCH_DIR}/uarch.cpp ${ARCH_CPU_DIR}/arch.cpp)
34+
set_source_files_properties(${ARCH_CPU_DIR}/arch.cpp PROPERTIES COMPILE_FLAGS "-fopenmp")
35+
set_source_files_properties(${ARCH_CPU_DIR}/arch_sse.cpp PROPERTIES COMPILE_FLAGS "-msse -fopenmp")
36+
set_source_files_properties(${ARCH_CPU_DIR}/arch_avx.cpp PROPERTIES COMPILE_FLAGS "-mavx -fopenmp")
37+
set_source_files_properties(${ARCH_CPU_DIR}/arch_avx512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -fopenmp")
38+
39+
add_library(cpu_device STATIC ${CPUFETCH_DIR}/cpufetch.cpp ${CPUFETCH_DIR}/cpuid.cpp ${CPUFETCH_DIR}/uarch.cpp ${ARCH_CPU_DIR}/arch.cpp ${ARCH_CPU_DIR}/arch_sse.cpp ${ARCH_CPU_DIR}/arch_avx.cpp ${ARCH_CPU_DIR}/arch_avx512.cpp)
3640
target_link_libraries(peakperf -lm -fopenmp cpu_device sandy_bridge ivy_bridge haswell skylake_128 skylake_256 skylake_512 broadwell ice_lake knl zen zen2)
3741

3842
# architecture specific build

src/cpu/arch/arch.cpp

+13-181
Original file line number | Diff line number | Diff line change
@@ -6,78 +6,6 @@
66
#include "arch.hpp"
77
#include "../../global.hpp"
88

9-
#include "sandy_bridge.hpp"
10-
#include "ivy_bridge.hpp"
11-
#include "haswell.hpp"
12-
#include "skylake_128.hpp"
13-
#include "skylake_256.hpp"
14-
#include "skylake_512.hpp"
15-
#include "broadwell.hpp"
16-
#include "cannon_lake_256.hpp"
17-
#include "cannon_lake_512.hpp"
18-
#include "ice_lake.hpp"
19-
#include "knl.hpp"
20-
#include "zen.hpp"
21-
#include "zen2.hpp"
22-
23-
struct benchmark_cpu {
24-
int n_threads;
25-
double gflops;
26-
const char* name;
27-
bench_type benchmark_type;
28-
void (*compute_function_128)(__m128 *farr_ptr, __m128, int);
29-
void (*compute_function_256)(__m256 *farr_ptr, __m256, int);
30-
void (*compute_function_512)(__m512 *farr_ptr, __m512, int);
31-
};
32-
33-
enum {
34-
BENCH_128_8,
35-
BENCH_256_6_NOFMA,
36-
BENCH_256_5,
37-
BENCH_256_8,
38-
BENCH_256_10,
39-
BENCH_512_8,
40-
BENCH_512_12,
41-
};
42-
43-
static const char *bench_name[] = {
44-
/*[BENCH_TYPE_SANDY_BRIDGE] = */ "Sandy Bridge (AVX)",
45-
/*[BENCH_TYPE_IVY_BRIDGE] = */ "Ivy Bridge (AVX)",
46-
/*[BENCH_TYPE_HASWELL] = */ "Haswell (AVX2)",
47-
/*[BENCH_TYPE_BROADWELL] = */ "Broadwell (AVX2)",
48-
/*[BENCH_TYPE_SKYLAKE_256] = */ "Skylake (SSE)",
49-
/*[BENCH_TYPE_SKYLAKE_256] = */ "Skylake (AVX2)",
50-
/*[BENCH_TYPE_SKYLAKE_512] = */ "Skylake (AVX512)",
51-
/*[BENCH_TYPE_KABY_LAKE] = */ "Kaby Lake (AVX2)",
52-
/*[BENCH_TYPE_COFFE_LAKE] = */ "Coffe Lake (AVX2)",
53-
/*[BENCH_TYPE_COMET_LAKE] = */ "Comet Lake (AVX2)",
54-
/*[BENCH_TYPE_ICE_LAKE] = */ "Ice Lake (AVX2)",
55-
/*[BENCH_TYPE_TIGER_LAKE] = */ "Tiger Lake (AVX2)",
56-
/*[BENCH_TYPE_KNIGHTS_LANDING] = */ "Knights Landing (AVX512)",
57-
/*[BENCH_TYPE_ZEN] = */ "Zen (AVX2)",
58-
/*[BENCH_TYPE_ZEN_PLUS] = */ "Zen+ (AVX2)",
59-
/*[BENCH_TYPE_ZEN2] = */ "Zen 2 (AVX2)",
60-
};
61-
62-
static const char *bench_types_str[] = {
63-
/*[BENCH_TYPE_SANDY_BRIDGE] = */ "sandy_bridge",
64-
/*[BENCH_TYPE_IVY_BRIDGE] = */ "ivy_bridge",
65-
/*[BENCH_TYPE_HASWELL] = */ "haswell",
66-
/*[BENCH_TYPE_BROADWELL] = */ "broadwell",
67-
/*[BENCH_TYPE_SKYLAKE_256] = */ "skylake_128",
68-
/*[BENCH_TYPE_SKYLAKE_256] = */ "skylake_256",
69-
/*[BENCH_TYPE_SKYLAKE_512] = */ "skylake_512",
70-
/*[BENCH_TYPE_KABY_LAKE] = */ "kaby_lake",
71-
/*[BENCH_TYPE_COFFE_LAKE] = */ "coffe_lake",
72-
/*[BENCH_TYPE_COMET_LAKE] = */ "comet_lake",
73-
/*[BENCH_TYPE_ICE_LAKE] = */ "ice_lake",
74-
/*[BENCH_TYPE_TIGER_LAKE] = */ "tiger_lake",
75-
/*[BENCH_TYPE_KNIGHTS_LANDING] = */ "knights_landing",
76-
/*[BENCH_TYPE_ZEN] = */ "zen",
77-
/*[BENCH_TYPE_ZEN_PLUS] = */ "zen_plus",
78-
/*[BENCH_TYPE_ZEN2] = */ "zen2",
79-
};
80-
819
bench_type parse_benchmark_cpu(char* str) {
8210
int len = sizeof(bench_types_str) / sizeof(bench_types_str[0]);
8311
for(bench_type t = 0; t < len; t++) {
@@ -176,82 +104,12 @@ double compute_gflops(int n_threads, char bench) {
176104
* - Zen 2 -> zen2
177105
*/
178106
bool select_benchmark(struct benchmark_cpu* bench) {
179-
bench->compute_function_128 = NULL;
180-
bench->compute_function_256 = NULL;
181-
bench->compute_function_512 = NULL;
182-
183-
switch(bench->benchmark_type) {
184-
case BENCH_TYPE_SANDY_BRIDGE:
185-
bench->compute_function_256 = compute_sandy_bridge;
186-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_6_NOFMA);
187-
break;
188-
case BENCH_TYPE_IVY_BRIDGE:
189-
bench->compute_function_256 = compute_ivy_bridge;
190-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_6_NOFMA);
191-
break;
192-
case BENCH_TYPE_HASWELL:
193-
bench->compute_function_256 = compute_haswell;
194-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_10);
195-
break;
196-
case BENCH_TYPE_SKYLAKE_512:
197-
bench->compute_function_512 = compute_skylake_512;
198-
bench->gflops = compute_gflops(bench->n_threads, BENCH_512_8);
199-
break;
200-
case BENCH_TYPE_SKYLAKE_256:
201-
bench->compute_function_256 = compute_skylake_256;
202-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_8);
203-
break;
204-
case BENCH_TYPE_SKYLAKE_128:
205-
bench->compute_function_128 = compute_skylake_128;
206-
bench->gflops = compute_gflops(bench->n_threads, BENCH_128_8);
207-
break;
208-
case BENCH_TYPE_BROADWELL:
209-
bench->compute_function_256 = compute_broadwell;
210-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_8);
211-
break;
212-
case BENCH_TYPE_KABY_LAKE:
213-
bench->compute_function_256 = compute_skylake_256;
214-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_8);
215-
break;
216-
case BENCH_TYPE_COFFE_LAKE:
217-
bench->compute_function_256 = compute_skylake_256;
218-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_8);
219-
break;
220-
case BENCH_TYPE_COMET_LAKE:
221-
bench->compute_function_256 = compute_skylake_256;
222-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_8);
223-
break;
224-
case BENCH_TYPE_ICE_LAKE:
225-
bench->compute_function_256 = compute_ice_lake;
226-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_8);
227-
break;
228-
case BENCH_TYPE_TIGER_LAKE:
229-
bench->compute_function_256 = compute_ice_lake;
230-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_8);
231-
break;
232-
case BENCH_TYPE_KNIGHTS_LANDING:
233-
bench->compute_function_512 = compute_knl;
234-
bench->gflops = compute_gflops(bench->n_threads, BENCH_512_12);
235-
break;
236-
case BENCH_TYPE_ZEN:
237-
bench->compute_function_256 = compute_zen;
238-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_5);
239-
break;
240-
case BENCH_TYPE_ZEN_PLUS:
241-
bench->compute_function_256 = compute_zen;
242-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_5);
243-
break;
244-
case BENCH_TYPE_ZEN2:
245-
bench->compute_function_256 = compute_zen2;
246-
bench->gflops = compute_gflops(bench->n_threads, BENCH_256_10);
247-
break;
248-
default:
249-
printErr("No valid benchmark! (bench: %d)", bench->benchmark_type);
250-
return false;
251-
}
252-
253-
bench->name = bench_name[bench->benchmark_type];
254-
return true;
107+
if(bench->benchmark_type == BENCH_TYPE_SKYLAKE_128)
108+
return select_benchmark_sse(bench);
109+
else if(bench->benchmark_type == BENCH_TYPE_SKYLAKE_512 || bench->benchmark_type == BENCH_TYPE_KNIGHTS_LANDING)
110+
return select_benchmark_avx512(bench);
111+
else
112+
return select_benchmark_avx(bench);
255113
}
256114

257115
struct benchmark_cpu* init_benchmark_cpu(struct cpu* cpu, int n_threads, char *bench_type_str) {
@@ -351,39 +209,13 @@ struct benchmark_cpu* init_benchmark_cpu(struct cpu* cpu, int n_threads, char *b
351209
return NULL;
352210
}
353211

354-
bool compute_cpu (struct benchmark_cpu* bench, double* e_time) {
355-
struct timeval t1, t2;
356-
gettimeofday(&t1, NULL);
357-
358-
if(bench->benchmark_type == BENCH_TYPE_SKYLAKE_512 || bench->benchmark_type == BENCH_TYPE_KNIGHTS_LANDING) {
359-
__m512 mult = {0};
360-
__m512 *farr_ptr = NULL;
361-
362-
#pragma omp parallel for
363-
for(int t=0; t < bench->n_threads; t++)
364-
bench->compute_function_512(farr_ptr, mult, t);
365-
}
366-
else if(bench->benchmark_type == BENCH_TYPE_SKYLAKE_128) {
367-
__m128 mult = {0};
368-
__m128 *farr_ptr = NULL;
369-
370-
#pragma omp parallel for
371-
for(int t=0; t < bench->n_threads; t++)
372-
bench->compute_function_128(farr_ptr, mult, t);
373-
}
374-
else {
375-
__m256 mult = {0};
376-
__m256 *farr_ptr = NULL;
377-
378-
#pragma omp parallel for
379-
for(int t=0; t < bench->n_threads; t++)
380-
bench->compute_function_256(farr_ptr, mult, t);
381-
}
382-
383-
gettimeofday(&t2, NULL);
384-
*e_time = (double)((t2.tv_sec-t1.tv_sec)*1000000 + t2.tv_usec-t1.tv_usec)/1000000;
385-
386-
return true;
212+
bool compute_cpu(struct benchmark_cpu* bench, double* e_time) {
213+
if(bench->benchmark_type == BENCH_TYPE_SKYLAKE_128)
214+
return compute_cpu_sse(bench, e_time);
215+
else if(bench->benchmark_type == BENCH_TYPE_SKYLAKE_512 || bench->benchmark_type == BENCH_TYPE_KNIGHTS_LANDING)
216+
return compute_cpu_avx512(bench, e_time);
217+
else
218+
return compute_cpu_avx(bench, e_time);
387219
}
388220

389221
double get_gflops_cpu(struct benchmark_cpu* bench) {

src/cpu/arch/arch.hpp

+64
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,31 @@
55
#include "../cpufetch/uarch.hpp"
66
#include "../../getarg.hpp"
77

8+
#include "arch_sse.hpp"
9+
#include "arch_avx.hpp"
10+
#include "arch_avx512.hpp"
11+
12+
struct benchmark_cpu {
13+
int n_threads;
14+
double gflops;
15+
const char* name;
16+
bench_type benchmark_type;
17+
18+
struct benchmark_cpu_sse* bench_sse;
19+
struct benchmark_cpu_avx* bench_avx;
20+
struct benchmark_cpu_avx512* bench_avx512;
21+
};
22+
23+
enum {
24+
BENCH_128_8,
25+
BENCH_256_6_NOFMA,
26+
BENCH_256_5,
27+
BENCH_256_8,
28+
BENCH_256_10,
29+
BENCH_512_8,
30+
BENCH_512_12,
31+
};
32+
833
enum bench_types {
934
BENCH_TYPE_SANDY_BRIDGE,
1035
BENCH_TYPE_IVY_BRIDGE,
@@ -24,6 +49,44 @@ enum bench_types {
2449
BENCH_TYPE_ZEN2,
2550
};
2651

52+
static const char *bench_name[] = {
53+
/*[BENCH_TYPE_SANDY_BRIDGE] = */ "Sandy Bridge (AVX)",
54+
/*[BENCH_TYPE_IVY_BRIDGE] = */ "Ivy Bridge (AVX)",
55+
/*[BENCH_TYPE_HASWELL] = */ "Haswell (AVX2)",
56+
/*[BENCH_TYPE_BROADWELL] = */ "Broadwell (AVX2)",
57+
/*[BENCH_TYPE_SKYLAKE_128] = */ "Skylake (SSE)",
58+
/*[BENCH_TYPE_SKYLAKE_256] = */ "Skylake (AVX2)",
59+
/*[BENCH_TYPE_SKYLAKE_512] = */ "Skylake (AVX512)",
60+
/*[BENCH_TYPE_KABY_LAKE] = */ "Kaby Lake (AVX2)",
61+
/*[BENCH_TYPE_COFFE_LAKE] = */ "Coffe Lake (AVX2)",
62+
/*[BENCH_TYPE_COMET_LAKE] = */ "Comet Lake (AVX2)",
63+
/*[BENCH_TYPE_ICE_LAKE] = */ "Ice Lake (AVX2)",
64+
/*[BENCH_TYPE_TIGER_LAKE] = */ "Tiger Lake (AVX2)",
65+
/*[BENCH_TYPE_KNIGHTS_LANDING] = */ "Knights Landing (AVX512)",
66+
/*[BENCH_TYPE_ZEN] = */ "Zen (AVX2)",
67+
/*[BENCH_TYPE_ZEN_PLUS] = */ "Zen+ (AVX2)",
68+
/*[BENCH_TYPE_ZEN2] = */ "Zen 2 (AVX2)",
69+
};
70+
71+
static const char *bench_types_str[] = {
72+
/*[BENCH_TYPE_SANDY_BRIDGE] = */ "sandy_bridge",
73+
/*[BENCH_TYPE_IVY_BRIDGE] = */ "ivy_bridge",
74+
/*[BENCH_TYPE_HASWELL] = */ "haswell",
75+
/*[BENCH_TYPE_BROADWELL] = */ "broadwell",
76+
/*[BENCH_TYPE_SKYLAKE_128] = */ "skylake_128",
77+
/*[BENCH_TYPE_SKYLAKE_256] = */ "skylake_256",
78+
/*[BENCH_TYPE_SKYLAKE_512] = */ "skylake_512",
79+
/*[BENCH_TYPE_KABY_LAKE] = */ "kaby_lake",
80+
/*[BENCH_TYPE_COFFE_LAKE] = */ "coffe_lake",
81+
/*[BENCH_TYPE_COMET_LAKE] = */ "comet_lake",
82+
/*[BENCH_TYPE_ICE_LAKE] = */ "ice_lake",
83+
/*[BENCH_TYPE_TIGER_LAKE] = */ "tiger_lake",
84+
/*[BENCH_TYPE_KNIGHTS_LANDING] = */ "knights_landing",
85+
/*[BENCH_TYPE_ZEN] = */ "zen",
86+
/*[BENCH_TYPE_ZEN_PLUS] = */ "zen_plus",
87+
/*[BENCH_TYPE_ZEN2] = */ "zen2",
88+
};
89+
2790
#define BENCHMARK_CPU_ITERS 1000000000
2891
#define MAX_NUMBER_THREADS 512
2992

@@ -91,5 +154,6 @@ const char* get_benchmark_name_cpu(struct benchmark_cpu* bench);
91154
bench_type parse_benchmark_cpu(char* str);
92155
void print_bench_types_cpu(struct cpu* cpu);
93156
int get_n_threads(struct benchmark_cpu* bench);
157+
double compute_gflops(int n_threads, char bench);
94158

95159
#endif

0 commit comments

Comments
 (0)