Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit 12bafdc

Browse files
committed
Enable runtime gpu_arch auto-select based on devices where kernels are executing for gemm_int4 tests; enable device-specific compilation using USE_XETLA (xe_lpg, xe_hpg, xe_hpc).
Signed-off-by: Qun Gao <[email protected]>
1 parent 7848595 commit 12bafdc

File tree

7 files changed

+222
-60
lines changed

7 files changed

+222
-60
lines changed

CMakeLists.txt

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,30 @@ set(XETLA_OFFLINE_OPTIONS "${XETLA_OFFLINE_OPTIONS} -Xfinalizer -enableBCR")
5454
# Optimization to reduce the tokens used for DPAS instruction.
5555
set(XETLA_OFFLINE_OPTIONS "${XETLA_OFFLINE_OPTIONS} -Xfinalizer -DPASTokenReduction")
5656

57+
# USE_XETLA - Align to IPEX logic
58+
if(USE_XETLA) # A quoted string always evaluates to false unless: The string's value is one of the true constants
59+
string(REPLACE "," ";" USE_XETLA ${USE_XETLA})
60+
message("The used archs are: ${USE_XETLA}")
61+
elseif(NOT USE_XETLA) # if(<variable>): True if given a variable that is defined to a value that is not a false constant
62+
message("No archs specified. Stopping CMake execution here.")
63+
set(USE_XETLA "")
64+
endif()
65+
66+
set(XETLA_AVAILABLE_ARCHS xe_hpc xe_hpg xe_lpg)
67+
set(USE_XETLA_XE_LPG OFF)
68+
set(USE_XETLA_XE_HPG OFF)
69+
set(USE_XETLA_XE_HPC OFF)
70+
71+
foreach(used_arch IN LISTS USE_XETLA)
72+
if (used_arch IN_LIST XETLA_AVAILABLE_ARCHS)
73+
string(TOUPPER "${used_arch}" arch_upper)
74+
set(USE_XETLA_${arch_upper} ON)
75+
message(STATUS "XeTLA: Found arch from list: ${arch_upper}")
76+
else()
77+
message(FATAL_ERROR "Unexpected XeTLA architecture: ${used_arch}")
78+
endif()
79+
endforeach()
80+
5781
# AOT device
5882
set(USE_AOT_DEVLIST "" CACHE STRING "Set device list for AOT build")
5983
if (USE_AOT_DEVLIST)

tests/integration/CMakeLists.txt

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,29 @@ function(add_integration_test target "host_cpp")
1919
# target_link_libraries(${TARGET} PUBLIC MKL::MKL_SYCL)
2020
endfunction()
2121

22-
# add_subdirectory(vector_add)
23-
add_subdirectory(gemm)
24-
add_subdirectory(gemv)
25-
add_subdirectory(row_reduction)
26-
add_subdirectory(layer_norm)
27-
add_subdirectory(data_transformer)
28-
add_subdirectory(default_config)
29-
add_subdirectory(sg_dropout_op)
30-
add_subdirectory(limitation)
31-
add_subdirectory(softmax)
32-
add_subdirectory(fmha)
33-
add_subdirectory(col_major_shuf)
22+
if (USE_XETLA_XE_LPG)
23+
add_subdirectory(vector_add)
24+
add_subdirectory(gemm)
25+
# add_subdirectory(row_reduction)
26+
# add_subdirectory(layer_norm)
27+
# add_subdirectory(data_transformer)
28+
# add_subdirectory(default_config)
29+
# add_subdirectory(sg_dropout_op)
30+
add_subdirectory(limitation)
31+
# add_subdirectory(softmax)
32+
add_subdirectory(fmha)
33+
add_subdirectory(col_major_shuf)
34+
else()
35+
# add_subdirectory(vector_add)
36+
add_subdirectory(gemm)
37+
add_subdirectory(gemv)
38+
add_subdirectory(row_reduction)
39+
add_subdirectory(layer_norm)
40+
add_subdirectory(data_transformer)
41+
add_subdirectory(default_config)
42+
add_subdirectory(sg_dropout_op)
43+
add_subdirectory(limitation)
44+
add_subdirectory(softmax)
45+
add_subdirectory(fmha)
46+
add_subdirectory(col_major_shuf)
47+
endif()

tests/integration/gemm/CMakeLists.txt

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,16 @@
11
include_directories(${CMAKE_SOURCE_DIR}/tests/integration/gemm)
2-
3-
add_subdirectory(bf16)
4-
add_subdirectory(stream_k)
5-
add_subdirectory(fp16)
6-
add_subdirectory(fp32)
7-
add_subdirectory(int8_quantization)
8-
add_subdirectory(int8)
9-
add_subdirectory(tf32)
10-
add_subdirectory(int4_dequantization)
11-
add_subdirectory(int4_dequantization_bias)
12-
add_subdirectory(unaligned_bf16)
2+
if (USE_XETLA_XE_LPG)
3+
add_subdirectory(int4_dequantization)
4+
add_subdirectory(int4_dequantization_bias)
5+
else()
6+
add_subdirectory(bf16)
7+
add_subdirectory(stream_k)
8+
add_subdirectory(fp16)
9+
add_subdirectory(fp32)
10+
add_subdirectory(int8_quantization)
11+
add_subdirectory(int8)
12+
add_subdirectory(tf32)
13+
add_subdirectory(int4_dequantization)
14+
add_subdirectory(int4_dequantization_bias)
15+
add_subdirectory(unaligned_bf16)
16+
endif()

tests/integration/gemm/int4_dequantization/main.cpp

Lines changed: 97 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,11 @@ class last {
157157
using data_type_c = fp16;
158158
};
159159

160-
template <class Test>
160+
template <class Test, gpu_arch x, mma_engine y>
161+
class KernalName {
162+
163+
};
164+
template <class Test, gpu_arch x, mma_engine y>
161165
void dequantize_gemm_run(uint32_t iter) {
162166
using namespace gpu;
163167
// Accept incoming parameters
@@ -238,16 +242,16 @@ void dequantize_gemm_run(uint32_t iter) {
238242
data_type_scale,
239243
data_type_zero_pt,
240244
quant_info,
241-
mma_engine::xmx,
242-
gpu_arch::XeHpg>;
245+
y,
246+
x>;
243247
using gemm_t = xetla::group::
244248
gemm_t<compute_policy, tile_shape, mem_desc_a_t, mem_desc_b_t>;
245249

246250
using epilogue_t = xetla::group::epilogue_t<
247-
xetla::group::epilogue_policy_default<gpu_arch::XeHpg>,
251+
xetla::group::epilogue_policy_default<x>,
248252
tile_shape,
249253
mem_desc_c_t>;
250-
using group_swizzle = xetla::kernel::group_swizzle_default<gpu_arch::XeHpg>;
254+
using group_swizzle = xetla::kernel::group_swizzle_default<x>;
251255
using gemm_op_t = xetla::kernel::gemm_universal_t<
252256
gpu::xetla::kernel::dispatch_policy_int4_dequantize_kslicing<
253257
group_swizzle,
@@ -366,7 +370,7 @@ void dequantize_gemm_run(uint32_t iter) {
366370
for (uint32_t i = 0; i < iter; i++) {
367371
prof.cpu_start();
368372
auto e_esimd = queue.submit([&](handler& cgh) {
369-
cgh.parallel_for<Test>(nd_range, [=](nd_item<3> item) KERNEL_MAIN {
373+
cgh.parallel_for<KernalName<Test,x,y>>(nd_range, [=](nd_item<3> item) KERNEL_MAIN {
370374
// allocate slm and nbarrier resource
371375
slm_barrier_init<gemm_op_t>();
372376
gemm_op_t gemm_op;
@@ -433,8 +437,94 @@ template <typename T>
433437
class dequantize_gemm_test : public ::testing::Test {};
434438
TYPED_TEST_SUITE_P(dequantize_gemm_test);
435439

440+
template <template<gpu_arch, mma_engine, class T> class F, class G>
441+
class dispatch_arch_test
442+
{
443+
using T_RET = std::invoke_result_t<decltype(F<gpu_arch::XeHpc, mma_engine::xmx, G>::exec)>;
444+
445+
public:
446+
template <typename... Args>
447+
static T_RET exec(Args&&... args) {
448+
// save default formatting
449+
std::ios fmt_bak(nullptr);
450+
fmt_bak.copyfmt(std::cout);
451+
452+
sycl::device device;
453+
if (!device.has(aspect::ext_intel_device_id))
454+
throw std::runtime_error("Can not get device ID");
455+
auto deviceID = device.get_info<ext::intel::info::device::device_id>();
456+
std::cout << "deviceID: 0x" << std::hex //
457+
<< std::right << std::setfill('0') << deviceID << "\n";
458+
459+
// restore default formatting
460+
std::cout.copyfmt(fmt_bak);
461+
#if defined(SYCL_EXT_ONEAPI_DEVICE_ARCHITECTURE) && \
462+
SYCL_EXT_ONEAPI_DEVICE_ARCHITECTURE
463+
// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc#feature-test-macro
464+
try {
465+
namespace ENS = sycl::ext::oneapi::experimental;
466+
auto deviceArch = device.get_info<ENS::info::device::architecture>();
467+
switch (deviceArch) {
468+
case ENS::architecture::intel_gpu_pvc:
469+
return F<gpu_arch::XeHpc, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
470+
return;
471+
case ENS::architecture::intel_gpu_dg2_g10:
472+
case ENS::architecture::intel_gpu_dg2_g11:
473+
case ENS::architecture::intel_gpu_dg2_g12:
474+
return F<gpu_arch::XeHpg, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
475+
default:
476+
break;
477+
}
478+
}
479+
catch (...) {
480+
std::cout << "Execption occurred! Please check one api versions.";
481+
}
482+
#endif
483+
std::cout << "No matching architecture, checking device ID ...\n";
484+
switch (deviceID) {
485+
// MTL devices
486+
case 0x7d55: // Intel® Arc ™ Graphics
487+
std::cout << "MTL devices identified!" << std::endl;
488+
return F<gpu_arch::XeLpg, mma_engine::fpu, G>::exec(std::forward<Args>(args)...);
489+
// DG2 devices
490+
case 0x56a0: // Intel® Arc ™ A770 Graphics
491+
case 0x56a1: // Intel® Arc ™ A750 Graphics
492+
case 0x56a2: // Intel® Arc ™ A580 Graphics
493+
case 0x5690: // Intel® Arc ™ A770M Graphics
494+
case 0x5691: // Intel® Arc ™ A730M Graphics
495+
case 0x5692: // Intel® Arc ™ A550M Graphics
496+
return F<gpu_arch::XeHpg, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
497+
// PVC devices
498+
case 0x0bda: //
499+
return F<gpu_arch::XeHpc, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
500+
default:
501+
std::cout << "Unknown device ID \n";
502+
break;
503+
}
504+
505+
if (device.has(aspect::ext_intel_gpu_eu_simd_width))
506+
throw std::runtime_error("Can not get eu_simd_width");
507+
auto eu_simd_width =
508+
device.get_info<ext::intel::info::device::gpu_eu_simd_width>();
509+
if (eu_simd_width == 8) {
510+
return F<gpu_arch::XeHpg, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
511+
} else if (eu_simd_width == 16) {
512+
return F<gpu_arch::XeHpc, mma_engine::xmx, G>::exec(std::forward<Args>(args)...);
513+
} else {
514+
throw std::runtime_error("Can not get device ID");
515+
}
516+
}
517+
};
518+
519+
template <gpu_arch arch_tag, mma_engine engine_tag, typename T>
520+
struct main_wrapper {
521+
static constexpr auto exec = []() {
522+
dequantize_gemm_run<T, arch_tag, engine_tag>(ITER);
523+
};
524+
};
525+
436526
TYPED_TEST_P(dequantize_gemm_test, esimd) {
437-
dequantize_gemm_run<TypeParam>(ITER);
527+
dispatch_arch_test<main_wrapper, TypeParam>::exec();
438528
}
439529

440530
REGISTER_TYPED_TEST_SUITE_P(dequantize_gemm_test, esimd);

tests/integration/gemm/int4_dequantization_bias/CMakeLists.txt

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ set(ProjectIdXe ${ProjectId})
55
string(PREPEND ProjectIdClient "gemm_client_")
66
string(PREPEND ProjectIdXe "gemm_xe_")
77

8-
FILE(GLOB src_client main_client.cpp)
9-
add_integration_test(${ProjectIdClient} ${src_client})
10-
FILE(GLOB src_xe main_xe.cpp)
11-
add_integration_test(${ProjectIdXe} ${src_xe})
8+
if (USE_XETLA_XE_LPG)
9+
FILE(GLOB src_client main_client.cpp)
10+
add_integration_test(${ProjectIdClient} ${src_client})
11+
else()
12+
FILE(GLOB src_xe main_xe.cpp)
13+
add_integration_test(${ProjectIdXe} ${src_xe})
14+
endif()
Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
add_subdirectory(tf32_1d)
2-
add_subdirectory(bf16_2d)
3-
add_subdirectory(int32_1d)
4-
add_subdirectory(int32_2d)
1+
if (USE_XETLA_XE_LPG)
2+
add_subdirectory(int32_1d)
3+
else()
4+
add_subdirectory(tf32_1d)
5+
add_subdirectory(bf16_2d)
6+
add_subdirectory(int32_1d)
7+
add_subdirectory(int32_2d)
8+
endif()

tests/unit/CMakeLists.txt

Lines changed: 45 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -19,25 +19,48 @@ function(add_unit_test target kernel_func_file test_host)
1919
set_tests_properties(${TARGET} PROPERTIES LABELS "unit" TIMEOUT ${UNIT_TIMEOUT})
2020
endfunction()
2121

22-
add_subdirectory(global_load_store)
23-
add_subdirectory(global_atomic)
24-
add_subdirectory(block_load_store)
25-
add_subdirectory(tile_load_store)
26-
add_subdirectory(tile_load_store_local)
27-
add_subdirectory(internal_type_load_store_cvt)
28-
add_subdirectory(local_load_store)
29-
add_subdirectory(raw_send)
30-
add_subdirectory(buff_compare)
31-
add_subdirectory(tile_mma)
32-
add_subdirectory(named_barrier)
33-
add_subdirectory(tile_row_reduction)
34-
add_subdirectory(add_c)
35-
add_subdirectory(imul)
36-
add_subdirectory(philox_rng)
37-
add_subdirectory(exp_inv_sqrt_tanh)
38-
add_subdirectory(reg_layout_conversion)
39-
add_subdirectory(reg_reduce)
40-
add_subdirectory(math_general)
41-
add_subdirectory(epilogue_tile_op)
42-
add_subdirectory(bit_mask_manipulation)
43-
22+
if (USE_XETLA_XE_LPG)
23+
add_subdirectory(global_load_store)
24+
add_subdirectory(global_atomic)
25+
# add_subdirectory(block_load_store)
26+
# add_subdirectory(tile_load_store)
27+
# add_subdirectory(tile_load_store_local)
28+
# add_subdirectory(internal_type_load_store_cvt)
29+
add_subdirectory(local_load_store)
30+
# add_subdirectory(raw_send)
31+
add_subdirectory(buff_compare)
32+
# add_subdirectory(tile_mma)
33+
# add_subdirectory(named_barrier)
34+
# add_subdirectory(tile_row_reduction)
35+
add_subdirectory(add_c)
36+
add_subdirectory(imul)
37+
add_subdirectory(philox_rng)
38+
# add_subdirectory(exp_inv_sqrt_tanh)
39+
# add_subdirectory(reg_layout_conversion)
40+
add_subdirectory(reg_reduce)
41+
add_subdirectory(math_general)
42+
# add_subdirectory(epilogue_tile_op)
43+
# add_subdirectory(bit_mask_manipulation)
44+
else()
45+
add_subdirectory(global_load_store)
46+
add_subdirectory(global_atomic)
47+
add_subdirectory(block_load_store)
48+
add_subdirectory(tile_load_store)
49+
add_subdirectory(tile_load_store_local)
50+
add_subdirectory(internal_type_load_store_cvt)
51+
add_subdirectory(local_load_store)
52+
add_subdirectory(raw_send)
53+
add_subdirectory(buff_compare)
54+
add_subdirectory(tile_mma)
55+
add_subdirectory(named_barrier)
56+
add_subdirectory(tile_row_reduction)
57+
add_subdirectory(add_c)
58+
add_subdirectory(imul)
59+
add_subdirectory(philox_rng)
60+
add_subdirectory(exp_inv_sqrt_tanh)
61+
add_subdirectory(reg_layout_conversion)
62+
add_subdirectory(reg_reduce)
63+
add_subdirectory(math_general)
64+
add_subdirectory(epilogue_tile_op)
65+
add_subdirectory(bit_mask_manipulation)
66+
endif()

0 commit comments

Comments
 (0)