From 9e069dc18e23358d16da2135f8eca9fc9a5e1900 Mon Sep 17 00:00:00 2001 From: "Mario Sieg (Neo)" Date: Wed, 22 Jan 2025 13:46:53 +0100 Subject: [PATCH 01/16] Omit frame pointer --- cmake/comflags.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/comflags.cmake b/cmake/comflags.cmake index 223bf2b..3ff933a 100644 --- a/cmake/comflags.cmake +++ b/cmake/comflags.cmake @@ -27,6 +27,7 @@ set(MAG_CLANG_COMPILE_FLAGS set(MAG_CLANG_RELEASE_COMPILE_FLAGS -O3 -flto + -fomit-frame-pointer ) set(MAG_CLANG_LINK_OPTIONS "") set(MAG_CLANG_RELEASE_LINK_OPTIONS -flto) From ee85f9f2aead3a3b9ee3f7c983fed9a6ac2e0789 Mon Sep 17 00:00:00 2001 From: "Mario Sieg (Neo)" Date: Thu, 23 Jan 2025 17:46:21 +0100 Subject: [PATCH 02/16] AArch64 Apple Silicon feature detection --- magnetron/magnetron.c | 59 ++++++++++-- magnetron/magnetron_cpu.c | 12 +-- magnetron/magnetron_internal.h | 168 +++++++++++++++++++-------------- 3 files changed, 152 insertions(+), 87 deletions(-) diff --git a/magnetron/magnetron.c b/magnetron/magnetron.c index 3d4ab9e..9b0f8e0 100644 --- a/magnetron/magnetron.c +++ b/magnetron/magnetron.c @@ -161,6 +161,12 @@ const uint32_t mag_x86_64_feature_masks[MAG_X86_64_FEATURE__NUM] = { }; #undef _ #undef mag_x86_64_feature_def +#elif defined(__aarch64__) +#define _(ident) #ident +const char* const mag_arm64_cap_names[MAG_ARM64_CAP__NUM] = { + mag_arm64_feature_def(_, MAG_SEP) +}; +#undef _ #endif void* (*mag_get_alloc_fn(void))(void* blk, size_t size) { @@ -484,10 +490,10 @@ static void mag_system_host_info_dump(mag_ctx_t* ctx) { #else #error "Unknwon CPU arch" #endif - mag_log_info("CPU (%s): %s, Virtual Cores: %u, Physical Cores: %u, Sockets: %u", cpu_arch, ctx->sys.cpu_name, ctx->sys.cpu_virtual_cores, ctx->sys.cpu_physical_cores, ctx->sys.cpu_sockets); + mag_log_info("CPU: %s, Virtual Cores: %u, Physical Cores: %u, Sockets: %u", ctx->sys.cpu_name, ctx->sys.cpu_virtual_cores, ctx->sys.cpu_physical_cores, ctx->sys.cpu_sockets); #if defined(__x86_64__) || 
defined(_M_X64) /* Print CPU features for x86-64 platforms. */ if (mag_log_enabled) { - printf("CPU Features:"); + mag_log_info("%s caps:", cpu_arch); for (uint32_t i=0, k=0; i < MAG_X86_64_FEATURE__NUM; ++i) { if (mag_ctx_x86_64_cpu_has_feature(ctx, i)) { if ((k++ & 7) == 0) printf("\n\t"); @@ -496,6 +502,16 @@ static void mag_system_host_info_dump(mag_ctx_t* ctx) { } putchar('\n'); } + #elif defined(__aarch64__) + if (mag_log_enabled) { + printf(MAG_CC_CYAN "[magnetron] " MAG_CC_RESET "%s caps: ", cpu_arch); + for (uint32_t i=0; i < MAG_ARM64_CAP__NUM; ++i) { + if (ctx->sys.arm64_cpu_features & (1u<sys.phys_mem_total, &ctx->sys.phys_mem_free); #if defined(__x86_64__) || defined(_M_X64) mag_system_info_query_x86_64_cpu_features(&ctx->sys.x86_64_cpu_features); - #elif defined(__aarch64__) && defined(__linux__) - mag_system_info_query_arm64_cpu_features(&ctx->sys.cpu_arm64_hwcap); + #elif defined(__aarch64__) + mag_system_info_query_arm64_cpu_features(&ctx->sys.arm64_cpu_features, &ctx->sys.arm64_cpu_sve_width); #endif if (mag_unlikely(!*ctx->sys.os_name)) snprintf(ctx->sys.os_name, sizeof(ctx->sys.os_name), "Unknown"); if (mag_unlikely(!*ctx->sys.cpu_name)) snprintf(ctx->sys.cpu_name, sizeof(ctx->sys.cpu_name), "Unknown"); diff --git a/magnetron/magnetron_cpu.c b/magnetron/magnetron_cpu.c index 8aaaa45..62d0dc5 100644 --- a/magnetron/magnetron_cpu.c +++ b/magnetron/magnetron_cpu.c @@ -50,7 +50,7 @@ static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_re has_all_features &= mag_ctx_x86_64_cpu_has_feature(ctx, features[j]); if (has_all_features) { /* Since specializations are sorted by score, we found the perfect spec. 
*/ (*spec->inject_kernels)(kernels); - mag_log_info("Using BLAS specialization: %s", spec->name); + mag_log_info("Using tuned BLAS specialization: %s", spec->name); return true; } } @@ -70,16 +70,12 @@ static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_re mag_cpu_blas_spec_decl(82); static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_registry_t* kernels) { -#ifdef __linux__ - long hwcap = ctx->sys.cpu_arm64_hwcap; - if (hwcap & HWCAP_FPHP) && (hwcap & HWCAP_ASIMDHP) && (hwcap && HWCAP_ASIMDDP)) { /* ARM v.8.2 f16 scalar + f16 vec + dotprod */ + mag_arm64_cap_t feat = ctx->sys.arm64_cpu_features; + if (feat & (1u<var, prefix) #if defined(__x86_64__) || defined(_M_X64) -#define MAG_X86_64_CPUID_0H 0 -#define MAG_X86_64_CPUID_1H 1 -#define MAG_X86_64_CPUID_2H 2 -#define MAG_X86_64_CPUID_7H 3 -#define MAG_X86_64_CPUID_80000001H 4 -#define MAG_X86_64_CPUID_80000007H 5 -#define MAG_X86_64_CPUID_16H 6 -#define MAG_X86_64_CPUID_7H_1H 7 -#define MAG_X86_64_CPUID_EAX 0 -#define MAG_X86_64_CPUID_EBX 1 -#define MAG_X86_64_CPUID_ECX 2 -#define MAG_X86_64_CPUID_EDX 3 - -#define mag_x86_64_feature_def(_, __) /* Enumerator | CPUDID Leaf | Register | Bit Index */\ - _(AVX , 1H, ECX, 28)__\ - _(AVX2 , 7H, EBX, 5)__\ - _(AVXVNNI , 7H_1H, EAX, 4)__\ - _(AVXVNNIINT8 , 7H_1H, EDX, 4)__\ - _(AVXVNNIINT16 , 7H_1H, EDX, 10)__\ - _(AVX512BW , 7H, EBX, 30)__\ - _(AVX512CD , 7H, EBX, 28)__\ - _(AVX512DQ , 7H, EBX, 17)__\ - _(AVX512ER , 7H, EBX, 27)__\ - _(AVX512F , 7H, EBX, 16)__\ - _(AVX512IFMA , 7H, EBX, 21)__\ - _(AVX512PF , 7H, EBX, 26)__\ - _(AVX512VBMI , 7H, ECX, 1)__\ - _(AVX512VL , 7H, EBX, 31)__\ - _(AVX512_4FMAPS , 7H, EDX, 3)__\ - _(AVX512_4VNNIW , 7H, EDX, 2)__\ - _(AVX512_FP16 , 7H, EDX, 23)__\ - _(AVX512_BF16 , 7H_1H, EAX, 5)__\ - _(AVX512_BITALG , 7H, ECX, 12)__\ - _(AVX512_VBMI2 , 7H, ECX, 6)__\ - _(AVX512_VNNI , 7H, ECX, 11)__\ - _(AVX512_VP2INTERSECT , 7H, EDX, 8)__\ - _(AVX512_VPOPCNTDQ , 7H, ECX, 14)__\ - _(BMI , 7H, 
EBX, 3)__\ - _(BMI2 , 7H, EBX, 8)__\ - _(F16C , 1H, ECX, 29)__\ - _(FMA , 1H, ECX, 12)__\ - _(FPU , 1H, EDX, 0)__\ - _(GFNI , 7H, ECX, 8)__\ - _(IA64 , 1H, EDX, 30)__\ - _(MMX , 1H, EDX, 23)__\ - _(OSXSAVE , 1H, ECX, 27)__\ - _(PCLMUL , 1H, ECX, 1)__\ - _(RDRND , 1H, ECX, 30)__\ - _(RDSEED , 7H, EBX, 18)__\ - _(RDTSCP , 80000001H, EDX, 27)__\ - _(SHA , 7H, EBX, 29)__\ - _(SSE , 1H, EDX, 25)__\ - _(SSE2 , 1H, EDX, 26)__\ - _(SSE3 , 1H, ECX, 0)__\ - _(SSE4_1 , 1H, ECX, 19)__\ - _(SSE4_2 , 1H, ECX, 20)__\ - _(SSSE3 , 1H, ECX, 9)__\ - _(VAES , 7H, ECX, 9)__\ - _(VME , 1H, EDX, 1)__\ - _(VMX , 1H, ECX, 5)__\ - _(VPCLMULQDQ , 7H, ECX, 10)__\ - _(XSAVE , 1H, ECX, 26)__\ - _(HYBRID_CPU , 7H, EDX, 15)__ - -#define _(enumerator, leaf, reg, bit) MAG_X86_64_FEATURE_##enumerator -typedef enum mag_x86_64_feature_t { - mag_x86_64_feature_def(_, MAG_SEP) - MAG_X86_64_FEATURE__NUM -} mag_x86_64_feature_t; -#undef _ -extern const char* const mag_x86_64_feature_names[MAG_X86_64_FEATURE__NUM]; extern const uint8_t mag_x86_64_feature_leaves[MAG_X86_64_FEATURE__NUM]; extern const uint8_t mag_x86_64_feature_regs[MAG_X86_64_FEATURE__NUM]; extern const uint32_t mag_x86_64_feature_masks[MAG_X86_64_FEATURE__NUM]; From b89a65dc2bd7683a5e2a5cdc08f5ed6c991da156 Mon Sep 17 00:00:00 2001 From: "Mario Sieg (Neo)" Date: Thu, 23 Jan 2025 18:33:49 +0100 Subject: [PATCH 03/16] armv9 blas backend --- CMakeLists.txt | 2 +- cmake/blas_tune.cmake | 6 +- .../{comflags.cmake => compiler_config.cmake} | 8 +++ magnetron/magnetron_cpu.c | 54 +++++++++++++----- magnetron/magnetron_cpu_blas.inl | 57 ++++++++++++++++++- ...4_82.c => magnetron_cpu_blas_arm64_v8_2.c} | 3 +- magnetron/magnetron_cpu_blas_arm64_v9.c | 10 ++++ 7 files changed, 120 insertions(+), 20 deletions(-) rename cmake/{comflags.cmake => compiler_config.cmake} (89%) rename magnetron/{magnetron_cpu_blas_arm64_82.c => magnetron_cpu_blas_arm64_v8_2.c} (78%) create mode 100644 magnetron/magnetron_cpu_blas_arm64_v9.c diff --git a/CMakeLists.txt 
b/CMakeLists.txt index 30be04a..2af510a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,7 +35,7 @@ endif() include(cmake/arch.cmake) include(cmake/lib.cmake) -include(cmake/comflags.cmake) +include(cmake/compiler_config.cmake) if (${MAGNETRON_ENABLE_CUDA}) include(cmake/cuda.cmake) diff --git a/cmake/blas_tune.cmake b/cmake/blas_tune.cmake index ca90d90..6ca481c 100644 --- a/cmake/blas_tune.cmake +++ b/cmake/blas_tune.cmake @@ -18,7 +18,8 @@ set(MAGNETRON_BLAS_SPEC_AMD64_SOURCES ) set(MAGNETRON_BLAS_SPEC_ARM64_SOURCES - magnetron/magnetron_cpu_blas_arm64_82.c + magnetron/magnetron_cpu_blas_arm64_v8_2.c + magnetron/magnetron_cpu_blas_arm64_v9.c ) if (${IS_AMD64}) # x86-64 specific compilation options @@ -30,5 +31,6 @@ if (${IS_AMD64}) # x86-64 specific compilation options set_blas_spec_arch("magnetron_cpu_blas_amd64_znver4.c" "-mavx -mavx2 -mfma -mf16c -mavx512f -mavx512vl -mavx512vnni -mavx512bf16 -mavx512bw -mavx512dq" "/arch:AVX512") elseif(${IS_ARM64}) set(MAGNETRON_SOURCES ${MAGNETRON_SOURCES} ${MAGNETRON_BLAS_SPEC_ARM64_SOURCES}) - set_blas_spec_arch("magnetron_cpu_blas_arm64_82.c" "-march=armv8.2-a+dotprod+fp16" "") + set_blas_spec_arch("magnetron_cpu_blas_arm64_v8_2.c" "-march=armv8.2-a+dotprod+fp16" "") + set_blas_spec_arch("magnetron_cpu_blas_arm64_v9.c" "-march=armv9-a+sve+sve2" "") endif() \ No newline at end of file diff --git a/cmake/comflags.cmake b/cmake/compiler_config.cmake similarity index 89% rename from cmake/comflags.cmake rename to cmake/compiler_config.cmake index 3ff933a..155a970 100644 --- a/cmake/comflags.cmake +++ b/cmake/compiler_config.cmake @@ -51,6 +51,14 @@ set(MAG_GCC_RELEASE_COMPILE_FLAGS set(MAG_GCC_LINK_OPTIONS "") set(MAG_GCC_RELEASE_LINK_OPTIONS -flto) +if (${IS_ARM64}) + set(MAG_CLANG_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -march=armv8.2-a) + set(MAG_GCC_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -march=armv8.2-a) +elseif (${IS_AMD64}) + set(MAG_CLANG_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -msse -msse2) + 
set(MAG_GCC_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -msse -msse2) +endif() + if (WIN32) # Windows (MSVC) specific config target_compile_options(magnetron PRIVATE ${MAG_MSVC_COMPILE_FLAGS}) target_link_options(magnetron PRIVATE ${MAG_MSVC_LINK_OPTIONS}) diff --git a/magnetron/magnetron_cpu.c b/magnetron/magnetron_cpu.c index 62d0dc5..53e1518 100644 --- a/magnetron/magnetron_cpu.c +++ b/magnetron/magnetron_cpu.c @@ -14,7 +14,7 @@ typedef struct mag_amd64_blas_specialization { void (*inject_kernels)(mag_kernel_registry_t* kernels); } mag_amd64_blas_specialization; -#define mag_cpu_blas_spec_decl(feat) \ +#define mag_amd64_blas_spec_decl(feat) \ const mag_x86_64_feature_t* mag_cpu_blas_specialization_amd64_##feat##_features(size_t* out_num); \ extern void mag_cpu_blas_specialization_amd64_##feat(mag_kernel_registry_t* kernels) @@ -25,11 +25,11 @@ typedef struct mag_amd64_blas_specialization { .inject_kernels = &mag_cpu_blas_specialization_amd64_##feat \ } -mag_cpu_blas_spec_decl(znver4); -mag_cpu_blas_spec_decl(avx512f); -mag_cpu_blas_spec_decl(avx2); -mag_cpu_blas_spec_decl(avx); -mag_cpu_blas_spec_decl(sse41); +mag_amd64_blas_spec_decl(znver4); +mag_amd64_blas_spec_decl(avx512f); +mag_amd64_blas_spec_decl(avx2); +mag_amd64_blas_spec_decl(avx); +mag_amd64_blas_spec_decl(sse41); static const mag_amd64_blas_specialization mag_amd64_blas_specializations[] = { /* Dynamic selectable BLAS permutations, sorted from best to worst score. 
*/ mag_amd64_blas_spec_permute(znver4), @@ -56,6 +56,7 @@ static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_re } /* No matching specialization found, use generic */ mag_cpu_blas_specialization_fallback(kernels); + mag_log_warn("Using fallback BLAS specialization"); return false; /* No spec used, fallback is active */ } @@ -64,20 +65,45 @@ static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_re #elif defined(__aarch64__) || defined(_M_ARM64) -#define mag_cpu_blas_spec_name(feat) mag_cpu_blas_specialization_arm64_##feat -#define mag_cpu_blas_spec_decl(feat) extern void mag_cpu_blas_spec_name(feat)(mag_kernel_registry_t* kernels) +typedef struct mag_arm64_blas_specialization { + const char* name; + mag_arm64_cap_t (*get_cap_permutation)(void); + void (*inject_kernels)(mag_kernel_registry_t* kernels); +} mag_arm64_blas_specialization; + +#define mag_arm64_blas_spec_decl(feat) \ + mag_arm64_cap_t mag_cpu_blas_specialization_arm64_v_##feat##_features(void); \ + extern void mag_cpu_blas_specialization_arm64_v_##feat(mag_kernel_registry_t* kernels) -mag_cpu_blas_spec_decl(82); +#define mag_arm64_blas_spec_permute(feat) \ + (mag_arm64_blas_specialization) { \ + .name = "arm64_"#feat, \ + .get_cap_permutation = &mag_cpu_blas_specialization_arm64_v_##feat##_features, \ + .inject_kernels = &mag_cpu_blas_specialization_arm64_v_##feat \ +} + +mag_arm64_blas_spec_decl(9); +mag_arm64_blas_spec_decl(8_2); + +static const mag_arm64_blas_specialization mag_arm64_blas_specializations[] = { /* Dynamic selectable BLAS permutations, sorted from best to worst score. 
*/ + mag_arm64_blas_spec_permute(9), + mag_arm64_blas_spec_permute(8_2), +}; static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_registry_t* kernels) { - mag_arm64_cap_t feat = ctx->sys.arm64_cpu_features; - if (feat & (1u<sys.arm64_cpu_features; + for (size_t i=0; i < sizeof(mag_arm64_blas_specializations)/sizeof(*mag_arm64_blas_specializations); ++i) { /* Find best blas spec for the host CPU */ + const mag_arm64_blas_specialization* spec = mag_arm64_blas_specializations+i; + mag_arm64_cap_t cap_required = (*spec->get_cap_permutation)(); /* Get requires features */ + if ((cap_avail & cap_required) == cap_required) { /* Since specializations are sorted by score, we found the perfect spec. */ + (*spec->inject_kernels)(kernels); + mag_log_info("Using tuned BLAS specialization: %s", spec->name); + return true; + } } /* No matching specialization found, use generic */ mag_cpu_blas_specialization_fallback(kernels); + mag_log_warn("Using fallback BLAS specialization"); return false; /* No spec used, fallback is active */ } diff --git a/magnetron/magnetron_cpu_blas.inl b/magnetron/magnetron_cpu_blas.inl index d525884..82587d6 100644 --- a/magnetron/magnetron_cpu_blas.inl +++ b/magnetron/magnetron_cpu_blas.inl @@ -1,3 +1,28 @@ +/* (c) 2025 Mario "Neo" Sieg. 
*/ + +/* +** +** + ARM 64 Versions and Features +** +==============+=============+==============+======================================================+ +** | armv8-a | Armv8-A | | +fp, +simd +** | armv8.1-a | Armv8.1-A | armv8-a, | +crc, +lse, +rdma +** | armv8.2-a | Armv8.2-A | armv8.1-a | +** | armv8.3-a | Armv8.3-A | armv8.2-a, | +pauth, +fcma, +jscvt +** | armv8.4-a | Armv8.4-A | armv8.3-a, | +flagm, +fp16fml, +dotprod, +rcpc2 +** | armv8.5-a | Armv8.5-A | armv8.4-a, | +sb, +ssbs, +predres, +frintts, +flagm2 +** | armv8.6-a | Armv8.6-A | armv8.5-a, | +bf16, +i8mm +** | armv8.7-a | Armv8.7-A | armv8.6-a, | +wfxt, +xs +** | armv8.8-a | Armv8.8-a | armv8.7-a, | +mops +** | armv8.9-a | Armv8.9-a | armv8.8-a | +** | armv9-a | Armv9-A | armv8.5-a, | +sve, +sve2 +** | armv9.1-a | Armv9.1-A | armv9-a, | +bf16, +i8mm +** | armv9.2-a | Armv9.2-A | armv9.1-a | +** | armv9.3-a | Armv9.3-A | armv9.2-a, | +mops +** | armv9.4-a | Armv9.4-A | armv9.3-a | +** | armv8-r | Armv8-R | armv8-r | +** +==============+=============+==============+======================================================+ +*/ + #include "magnetron_internal.h" #include @@ -1381,12 +1406,11 @@ static void MAG_HOTPROC mag_blas_matmul_f32(const mag_compute_payload_t* payload #ifndef MAG_BLAS_SPECIALIZATION #error "BLAS specialization undefined" #endif - -#if defined(__x86_64__) || defined(_M_X64) #ifndef MAG_BLAS_SPECIALIZATION_FEAT_REQUEST #error "Feature request routine undefined" #endif +#if defined(__x86_64__) || defined(_M_X64) const mag_x86_64_feature_t* MAG_BLAS_SPECIALIZATION_FEAT_REQUEST(size_t* out_num) { static const mag_x86_64_feature_t required_features[] = { #ifdef __AVX512F__ @@ -1514,6 +1538,35 @@ const mag_x86_64_feature_t* MAG_BLAS_SPECIALIZATION_FEAT_REQUEST(size_t* out_num *out_num = sizeof(required_features)/sizeof(*required_features); return required_features; } + +#elif defined(__aarch64__) + +mag_arm64_cap_t MAG_BLAS_SPECIALIZATION_FEAT_REQUEST(void) { + mag_arm64_cap_t caps = 1u< */ + 
+#if !defined(__ARM_FEATURE_SVE) || !defined(__ARM_FEATURE_SVE2) +#error "BLAS specialization requires matching compile flags" +#endif + +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_arm64_v_9 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_arm64_v_9_features + +#include "magnetron_cpu_blas.inl" From 8cb7eb981fc5d5cb18ab727ba31e3479da778d4c Mon Sep 17 00:00:00 2001 From: "Mario Sieg (Neo)" Date: Thu, 23 Jan 2025 18:50:32 +0100 Subject: [PATCH 04/16] Separated benchmarks --- python/benchmarks/bench.py | 57 +++++++++++------------ python/benchmarks/bench_all.py | 8 ++++ python/benchmarks/bench_square_matmul.py | 5 ++ python/benchmarks/bench_various_matmul.py | 5 ++ 4 files changed, 45 insertions(+), 30 deletions(-) create mode 100644 python/benchmarks/bench_all.py create mode 100644 python/benchmarks/bench_square_matmul.py create mode 100644 python/benchmarks/bench_various_matmul.py diff --git a/python/benchmarks/bench.py b/python/benchmarks/bench.py index c031939..18a9fcb 100644 --- a/python/benchmarks/bench.py +++ b/python/benchmarks/bench.py @@ -54,39 +54,36 @@ def allocate_args(self, shape_a: tuple[int, int], shape_b: tuple[int, int]): ('Matrix Multiplication', lambda x, y: x @ y), ] -max_dim = 256 -square_step = 8 -all_step = max_dim // 4 - print('Running performance benchmark...') print('Magnetron VS') for participant in participants: if not isinstance(participant, MagnetronBenchmark): print(f' {participant.name}') -print('\nSquare Matrix Benchmarks (NxN):') -square_shapes = generate_square_shapes(max_dim, square_step) -for op in elementwise_ops: - name, fn = op - print(f'Benchmarking {name} Operator') - benchmark(name, participants, fn, square_shapes, plot_style='lines') - -for op in matmul_ops: - name, fn = op - print(f'Benchmarking {name} Operator') - benchmark(name, participants, fn, square_shapes, plot_style='lines') - -print('\nAll Shapes Benchmarks:') -print('Elementwise Operations:') -elementwise_shapes = 
generate_elementwise_shapes(max_dim, all_step) -for op in elementwise_ops: - name, fn = op - print(f'Benchmarking {name} Operator') - benchmark(name, participants, fn, elementwise_shapes, plot_style='bars') - -print('\nMatrix Multiplication:') -matmul_shapes = generate_matmul_shapes(max_dim, all_step) -for op in matmul_ops: - name, fn = op - print(f'Benchmarking {name} Operator') - benchmark(name, participants, fn, matmul_shapes, plot_style='bars') +def bench_square_bin_ops(dim_lim: int=2048, step: int=32): + square_shapes = generate_square_shapes(dim_lim, step) + for op in elementwise_ops: + name, fn = op + print(f'Benchmarking {name} Operator') + benchmark(name, participants, fn, square_shapes, plot_style='lines') + +def bench_square_matmul(dim_lim: int=2048, step: int=32): + square_shapes = generate_square_shapes(dim_lim, step) + for op in matmul_ops: + name, fn = op + print(f'Benchmarking {name} Operator') + benchmark(name, participants, fn, square_shapes, plot_style='lines') + +def bench_permuted_bin_ops(dim_lim: int=2048, step: int=32): + elementwise_shapes = generate_elementwise_shapes(dim_lim, step) + for op in elementwise_ops: + name, fn = op + print(f'Benchmarking {name} Operator') + benchmark(name, participants, fn, elementwise_shapes, plot_style='bars') + +def bench_permuted_matmul(dim_lim: int=2048, step: int=32): + matmul_shapes = generate_matmul_shapes(dim_lim, step) + for op in matmul_ops: + name, fn = op + print(f'Benchmarking {name} Operator') + benchmark(name, participants, fn, matmul_shapes, plot_style='bars') diff --git a/python/benchmarks/bench_all.py b/python/benchmarks/bench_all.py new file mode 100644 index 0000000..f083108 --- /dev/null +++ b/python/benchmarks/bench_all.py @@ -0,0 +1,8 @@ +# (c) 2025 Mario "Neo" Sieg. 
+ +from benchmarks.bench import * + +bench_square_bin_ops(dim_lim=4096) +bench_square_matmul(dim_lim=4096) +bench_permuted_bin_ops(dim_lim=4096) +bench_permuted_matmul(dim_lim=4096) diff --git a/python/benchmarks/bench_square_matmul.py b/python/benchmarks/bench_square_matmul.py new file mode 100644 index 0000000..76a021a --- /dev/null +++ b/python/benchmarks/bench_square_matmul.py @@ -0,0 +1,5 @@ +# (c) 2025 Mario "Neo" Sieg. + +from benchmarks.bench import bench_square_matmul + +bench_square_matmul(dim_lim=1024) diff --git a/python/benchmarks/bench_various_matmul.py b/python/benchmarks/bench_various_matmul.py new file mode 100644 index 0000000..cd5c232 --- /dev/null +++ b/python/benchmarks/bench_various_matmul.py @@ -0,0 +1,5 @@ +# (c) 2025 Mario "Neo" Sieg. + +from benchmarks.bench import bench_permuted_matmul + +bench_permuted_matmul() From 34286c963d5ced37b856535d80aef5a0d364754f Mon Sep 17 00:00:00 2001 From: "Mario Sieg (Neo)" Date: Thu, 23 Jan 2025 19:42:16 +0100 Subject: [PATCH 05/16] Removed old comments --- .../magnetron/{models.py => model.py} | 49 +------------------ 1 file changed, 2 insertions(+), 47 deletions(-) rename python/magnetron_framework/magnetron/{models.py => model.py} (61%) diff --git a/python/magnetron_framework/magnetron/models.py b/python/magnetron_framework/magnetron/model.py similarity index 61% rename from python/magnetron_framework/magnetron/models.py rename to python/magnetron_framework/magnetron/model.py index 8a6bbad..10393eb 100644 --- a/python/magnetron_framework/magnetron/models.py +++ b/python/magnetron_framework/magnetron/model.py @@ -44,7 +44,6 @@ def cross_entropy(y: Tensor, y_hat: Tensor) -> float: class DenseLayer(Layer): def __init__(self, in_features: int, out_features: int): super().__init__() - # For column-based math, shape=(out_features, in_features) self.weight = Tensor.uniform(shape=(out_features, in_features)) self.bias = Tensor.uniform(shape=(out_features, 1)) self._x = None @@ -52,56 +51,23 @@ def 
__init__(self, in_features: int, out_features: int): self._out = None def forward(self, x: Tensor) -> Tensor: - """ - If we do: z = W @ x + b, - then out = sigmoid(z). - - We'll store both x and out (or z) for backward(). - """ - self._x = x # store input (shape=(in_features, batch_size)) + self._x = x self._z = self.weight @ x + self.bias self._out = self._z.sigmoid() return self._out def backward(self, is_hidden_layer: bool, delta: Tensor, rate: float) -> Tensor: - """ - `delta` here is dL/d(output_of_this_layer). We do: - - dW = delta @ x^T (since x is shape=(in_features, batch_size)) - db = mean of delta, per each output neuron - next_delta = W^T @ delta * σ'(z) [ only if is_hidden_layer=True ] - """ - # Weight update - # delta shape = (out_features, batch_size) - # x^T shape = (batch_size, in_features) - # so delta @ x^T is (out_features, in_features), which matches weight self.weight -= (delta @ self._x.transpose().clone()) * rate - - # Bias update: one bias per out_feature => take mean along batch_size axis=1 - # delta.mean(axis=1) gives shape (out_features,) so we keepdims to (out_features, 1) - #self.bias -= delta.mean(axis=1, keepdims=True) * rate - batch_size = delta.shape[1] ones_vec = Tensor.const([[1.0] for _ in range(batch_size)]) - row_sums = delta @ ones_vec # shape (out_features, 1) + row_sums = delta @ ones_vec row_means = row_sums * (1.0 / batch_size) self.bias -= row_means * rate - - # For the next layer’s delta = (W^T @ delta) * sigmoid'(z) - # We must use the derivative of the *post*–linear pre-activation z, - # or equivalently the derivative wrt the output if we have it stored. if is_hidden_layer: - # shape(W^T) = (in_features, out_features) - # shape(delta) = (out_features, batch_size) d_in = self.weight.transpose().clone() @ delta - # Multiply by derivative of out = sigmoid(z) - # i.e. out * (1 - out). If your library’s .sigmoid(derivative=True) - # expects the “pre-activated” z, you can do that here. 
d_in *= self._z.sigmoid(derivative=True) return d_in else: - # For the last layer, we return delta as is, - # or skip the activation derivative if you already did it in the top-level. return delta @@ -118,15 +84,8 @@ def forward(self, inputs: Tensor) -> Tensor: return x def backward(self, outputs: Tensor, targets: Tensor, rate: float): - """ - For the final layer delta, we do: delta = dL/dOut * sigmoid'(Out) - Then pass delta backward through each layer. - """ error = outputs - targets - # For the final layer’s activation derivative: delta = error * outputs.sigmoid(derivative=True) - - # Backprop through layers from last to first for i in reversed(range(len(self.layers))): is_hidden = (i > 0) delta = self.layers[i].backward(is_hidden, delta, rate) @@ -136,17 +95,13 @@ def train(self, inputs: Tensor, targets: Tensor, epochs: int, rate: float): import time start_time = time.time_ns() - # Optionally transpose if you want (features, batch) layout inputs = inputs.transpose().clone() targets = targets.transpose().clone() losses = [] for epoch in range(epochs): - # Forward pass output = self.forward(inputs) - # Backward pass self.backward(output, targets, rate) - # Compute and record loss loss = Optim.mse(output, targets) losses.append(loss) if epoch % self.loss_epoch_step == 0: From afe8a1d56d9070a423f15ce8e2db30cb3bd389a2 Mon Sep 17 00:00:00 2001 From: "Mario Sieg (Neo)" Date: Thu, 23 Jan 2025 20:02:12 +0100 Subject: [PATCH 06/16] To list --- python/magnetron_framework/magnetron/core.py | 4 ++-- python/magnetron_viewer/main.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/magnetron_framework/magnetron/core.py b/python/magnetron_framework/magnetron/core.py index ff65b94..0410579 100644 --- a/python/magnetron_framework/magnetron/core.py +++ b/python/magnetron_framework/magnetron/core.py @@ -474,7 +474,7 @@ def __del__(self) -> None: """Releases _ptr resources upon object destruction.""" if hasattr(self, '_ptr') and isinstance(self._ptr, 
ffi.CData) and self._ptr != ffi.NULL: C.mag_tensor_decref(self._ptr) - self._ptr = ffi.NULL + self._ptr = ffi.NULL _DISPATCH = { 1: C.mag_tensor_create_1d, @@ -827,7 +827,7 @@ def data_ptr(self) -> int: """ return int(ffi.cast('uintptr_t', C.mag_tensor_data_ptr(self._ptr))) - def to_list(self) -> list[float]: + def tolist(self) -> list[float]: """ Returns the tensor data as a Python list of floats. diff --git a/python/magnetron_viewer/main.py b/python/magnetron_viewer/main.py index 8d03e70..14e6f0b 100644 --- a/python/magnetron_viewer/main.py +++ b/python/magnetron_viewer/main.py @@ -119,7 +119,7 @@ def show_tensor_data(self, item): if tensor_name not in self.tensors: return tensor = self.tensors[tensor_name] - tensor_data = tensor.to_list() + tensor_data = tensor.tolist() rows = [] elements_per_row = 16 From 7a765bffd7f3d038c58619f52fa4dc869cecb75a Mon Sep 17 00:00:00 2001 From: "Mario Sieg (Neo)" Date: Thu, 23 Jan 2025 21:11:24 +0100 Subject: [PATCH 07/16] Convenience update --- python/examples/xor.py | 2 +- python/magnetron_framework/magnetron/core.py | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/python/examples/xor.py b/python/examples/xor.py index ffae237..ae537c0 100644 --- a/python/examples/xor.py +++ b/python/examples/xor.py @@ -1,7 +1,7 @@ # (c) 2025 Mario "Neo" Sieg. 
from magnetron import Tensor -from magnetron.models import SequentialModel, DenseLayer +from magnetron.model import SequentialModel, DenseLayer import matplotlib.pyplot as plt EPOCHS: int = 10000 diff --git a/python/magnetron_framework/magnetron/core.py b/python/magnetron_framework/magnetron/core.py index 0410579..b1648a0 100644 --- a/python/magnetron_framework/magnetron/core.py +++ b/python/magnetron_framework/magnetron/core.py @@ -474,7 +474,7 @@ def __del__(self) -> None: """Releases _ptr resources upon object destruction.""" if hasattr(self, '_ptr') and isinstance(self._ptr, ffi.CData) and self._ptr != ffi.NULL: C.mag_tensor_decref(self._ptr) - self._ptr = ffi.NULL + self._ptr = ffi.NULL _DISPATCH = { 1: C.mag_tensor_create_1d, @@ -656,7 +656,7 @@ def uniform(cls, shape: tuple[int, ...], *, interval: (float, float) = (-1.0, 1. return tensor @classmethod - def normal(cls, shape: tuple[int, ...], *, mean: float, stddev: float) -> 'Tensor': + def normal(cls, shape: tuple[int, ...], *, mean: float=0.0, stddev: float=1.0) -> 'Tensor': """ Creates a _ptr filled with random values from a normal distribution. @@ -1171,7 +1171,7 @@ def view(self) -> 'Tensor': def transpose(self) -> 'Tensor': """ - Transposes the tensor (swaps the last two dimensions). + Transposes the tensor (swaps the last two dimensions). Same as tensor.T. Returns ------- @@ -1180,6 +1180,18 @@ def transpose(self) -> 'Tensor': """ return Tensor(C.mag_transpose(self._ptr)) + @property + def T(self) -> 'Tensor': + """ + Transposes the tensor (swaps the last two dimensions). Same as tensor.transpose(). + + Returns + ------- + Tensor + A transposed tensor. + """ + return Tensor(C.mag_transpose(self._ptr)) + def permute(self, axes: tuple[int, ...]) -> 'Tensor': """ Permutes the dimensions of the tensor. 
From cc1a4049042dd9a2e747c04c1c3b7f989f364636 Mon Sep 17 00:00:00 2001 From: Neo Date: Thu, 23 Jan 2025 23:32:41 +0100 Subject: [PATCH 08/16] Refractored x86-64 CPU detection --- magnetron/magnetron.c | 230 +++++++++++++++------------- magnetron/magnetron_cpu.c | 18 +-- magnetron/magnetron_cpu_blas.inl | 249 +++++++++++++++---------------- magnetron/magnetron_internal.h | 147 ++++++++---------- 4 files changed, 319 insertions(+), 325 deletions(-) diff --git a/magnetron/magnetron.c b/magnetron/magnetron.c index 9b0f8e0..ce77c0e 100644 --- a/magnetron/magnetron.c +++ b/magnetron/magnetron.c @@ -139,36 +139,6 @@ void mag_free_aligned(void* blk) { #define STB_IMAGE_WRITE_IMPLEMENTATION #include -#if defined(__x86_64__) || defined(_M_X64) -#define _(enumerator, leaf, reg, bit) #enumerator -const char* const mag_x86_64_feature_names[MAG_X86_64_FEATURE__NUM] = { - mag_x86_64_feature_def(_, MAG_SEP) -}; -#undef _ -#define _(enumerator, leaf, reg, bit) (0xff&MAG_X86_64_CPUID_##leaf) -const uint8_t mag_x86_64_feature_leaves[MAG_X86_64_FEATURE__NUM] = { - mag_x86_64_feature_def(_, MAG_SEP) -}; -#undef _ -#define _(enumerator, leaf, reg, bit) (0xff&MAG_X86_64_CPUID_##reg) -const uint8_t mag_x86_64_feature_regs[MAG_X86_64_FEATURE__NUM] = { - mag_x86_64_feature_def(_, MAG_SEP) -}; -#undef _ -#define _(enumerator, leaf, reg, bit) (1u<<(bit)) -const uint32_t mag_x86_64_feature_masks[MAG_X86_64_FEATURE__NUM] = { - mag_x86_64_feature_def(_, MAG_SEP) -}; -#undef _ -#undef mag_x86_64_feature_def -#elif defined(__aarch64__) -#define _(ident) #ident -const char* const mag_arm64_cap_names[MAG_ARM64_CAP__NUM] = { - mag_arm64_feature_def(_, MAG_SEP) -}; -#undef _ -#endif - void* (*mag_get_alloc_fn(void))(void* blk, size_t size) { return mag_alloc; } @@ -314,6 +284,19 @@ uintptr_t mag_thread_id(void) { return tid; } +#if defined(__x86_64__) || defined(_M_X64) +#define _(enumerator, leaf, reg, bit) #enumerator +const char* const mag_amd64_cap_names[MAG_AMD64_CAP__NUM] = { + 
mag_x86_64_feature_def(_, MAG_SEP) +}; +#undef _ +#elif defined(__aarch64__) +#define _(ident) #ident +const char* const mag_arm64_cap_names[MAG_ARM64_CAP__NUM] = { + mag_arm64_feature_def(_, MAG_SEP) +}; +#endif + static uint64_t mag_hpc_clock_ns(void) { /* High precision clock in nanoseconds. */ #ifdef _WIN32 static LONGLONG t_freq; @@ -481,7 +464,7 @@ static void mag_prng_init(mag_ctx_t* ctx, uint64_t seed) { static void mag_system_host_info_query(mag_ctx_t* ctx); /* Query host system information. */ static void mag_system_host_info_dump(mag_ctx_t* ctx) { - mag_log_info("OS/Kernel: %s", ctx->sys.os_name); + mag_log_info("OS/Kernel: %s", ctx->machine.os_name); const char* cpu_arch = "?"; #if defined(__x86_64__) || defined(_M_X64) cpu_arch = "x86-64"; @@ -490,14 +473,13 @@ static void mag_system_host_info_dump(mag_ctx_t* ctx) { #else #error "Unknwon CPU arch" #endif - mag_log_info("CPU: %s, Virtual Cores: %u, Physical Cores: %u, Sockets: %u", ctx->sys.cpu_name, ctx->sys.cpu_virtual_cores, ctx->sys.cpu_physical_cores, ctx->sys.cpu_sockets); + mag_log_info("CPU: %s, Virtual Cores: %u, Physical Cores: %u, Sockets: %u", ctx->machine.cpu_name, ctx->machine.cpu_virtual_cores, ctx->machine.cpu_physical_cores, ctx->machine.cpu_sockets); #if defined(__x86_64__) || defined(_M_X64) /* Print CPU features for x86-64 platforms. 
*/ if (mag_log_enabled) { - mag_log_info("%s caps:", cpu_arch); - for (uint32_t i=0, k=0; i < MAG_X86_64_FEATURE__NUM; ++i) { - if (mag_ctx_x86_64_cpu_has_feature(ctx, i)) { - if ((k++ & 7) == 0) printf("\n\t"); - printf("%s ", mag_x86_64_feature_names[i]); + printf(MAG_CC_CYAN "[magnetron] " MAG_CC_RESET "%s caps: ", cpu_arch); + for (uint64_t i=0; i < MAG_AMD64_CAP__NUM; ++i) { + if (ctx->machine.amd64_cpu_caps & (1ull<sys.arm64_cpu_features & (1u<machine.arm64_cpu_features & (1u<sys.phys_mem_total, &mem_total, &mem_unit_total); - mag_humanize_memory_size(ctx->sys.phys_mem_free, &mem_free, &mem_unit_free); - mag_humanize_memory_size((size_t)llabs((int64_t)ctx->sys.phys_mem_total-(int64_t)ctx->sys.phys_mem_free), &mem_used, &mem_unit_used); - double mem_used_percent = fabs((double)(ctx->sys.phys_mem_total-ctx->sys.phys_mem_free))/(double)ctx->sys.phys_mem_total*100.0; + mag_humanize_memory_size(ctx->machine.phys_mem_total, &mem_total, &mem_unit_total); + mag_humanize_memory_size(ctx->machine.phys_mem_free, &mem_free, &mem_unit_free); + mag_humanize_memory_size((size_t)llabs((int64_t)ctx->machine.phys_mem_total-(int64_t)ctx->machine.phys_mem_free), &mem_used, &mem_unit_used); + double mem_used_percent = fabs((double)(ctx->machine.phys_mem_total-ctx->machine.phys_mem_free))/(double)ctx->machine.phys_mem_total*100.0; mag_log_info("Physical Machine Memory: %.03f %s, Free: %.03f %s, Used: %.03f %s (%.02f%%)", mem_total, mem_unit_total, mem_free, mem_unit_free, mem_used, mem_unit_used, mem_used_percent); } @@ -638,13 +620,13 @@ void mag_ctx_set_prng_algorithm(mag_ctx_t* ctx, mag_prng_algorithm_t algorithm, mag_compute_device_type_t mag_ctx_get_compute_device_type(const mag_ctx_t* ctx) { return ctx->device_type; } const char* mag_ctx_get_compute_device_name(const mag_ctx_t* ctx) { return ctx->device->name; } -const char* mag_ctx_get_os_name(const mag_ctx_t* ctx) { return ctx->sys.os_name; } -const char* mag_ctx_get_cpu_name(const mag_ctx_t* ctx) { return 
ctx->sys.cpu_name; } -uint32_t mag_ctx_get_cpu_virtual_cores(const mag_ctx_t* ctx) { return ctx->sys.cpu_virtual_cores; } -uint32_t mag_ctx_get_cpu_physical_cores(const mag_ctx_t* ctx) { return ctx->sys.cpu_physical_cores; } -uint32_t mag_ctx_get_cpu_sockets(const mag_ctx_t* ctx) { return ctx->sys.cpu_sockets; } -uint64_t mag_ctx_get_physical_memory_total(const mag_ctx_t* ctx) { return ctx->sys.phys_mem_total; } -uint64_t mag_ctx_get_physical_memory_free(const mag_ctx_t* ctx) { return ctx->sys.phys_mem_free; } +const char* mag_ctx_get_os_name(const mag_ctx_t* ctx) { return ctx->machine.os_name; } +const char* mag_ctx_get_cpu_name(const mag_ctx_t* ctx) { return ctx->machine.cpu_name; } +uint32_t mag_ctx_get_cpu_virtual_cores(const mag_ctx_t* ctx) { return ctx->machine.cpu_virtual_cores; } +uint32_t mag_ctx_get_cpu_physical_cores(const mag_ctx_t* ctx) { return ctx->machine.cpu_physical_cores; } +uint32_t mag_ctx_get_cpu_sockets(const mag_ctx_t* ctx) { return ctx->machine.cpu_sockets; } +uint64_t mag_ctx_get_physical_memory_total(const mag_ctx_t* ctx) { return ctx->machine.phys_mem_total; } +uint64_t mag_ctx_get_physical_memory_free(const mag_ctx_t* ctx) { return ctx->machine.phys_mem_free; } bool mag_ctx_is_numa_system(const mag_ctx_t* ctx) { return false; /* TODO */ } size_t mag_ctx_get_total_tensors_created(const mag_ctx_t* ctx) { return 0; /* TODO */ } @@ -812,24 +794,14 @@ void mag_ctx_profile_stop_recording(mag_ctx_t* ctx, const char* export_csv_file) bool csv = export_csv_file && *export_csv_file; if (!csv) { mag_print_separator(stdout); - printf("OS/Kernel: %s\n", ctx->sys.os_name); - printf("CPU: %s, Virtual Cores: %u, Physical Cores: %u, Sockets: %u\n", ctx->sys.cpu_name, ctx->sys.cpu_virtual_cores, ctx->sys.cpu_physical_cores, ctx->sys.cpu_sockets); - #if defined(__x86_64__) || defined(_M_X64) /* Print CPU features for x86-64 platforms. 
*/ - printf("CPU Features:"); - for (unsigned i=0, k=0; i < MAG_X86_64_FEATURE__NUM; ++i) { - if (mag_ctx_x86_64_cpu_has_feature(ctx, i)) { - if (k++ % 8 == 0) printf("\n\t"); - printf("%s ", mag_x86_64_feature_names[i]); - } - } - putchar('\n'); - #endif + printf("OS/Kernel: %s\n", ctx->machine.os_name); + printf("CPU: %s, Virtual Cores: %u, Physical Cores: %u, Sockets: %u\n", ctx->machine.cpu_name, ctx->machine.cpu_virtual_cores, ctx->machine.cpu_physical_cores, ctx->machine.cpu_sockets); double mem_total, mem_free, mem_used; const char* mem_unit_total, *mem_unit_free, *mem_unit_used; - mag_humanize_memory_size(ctx->sys.phys_mem_total, &mem_total, &mem_unit_total); - mag_humanize_memory_size(ctx->sys.phys_mem_free, &mem_free, &mem_unit_free); - mag_humanize_memory_size((size_t)llabs((int64_t)ctx->sys.phys_mem_total-(int64_t)ctx->sys.phys_mem_free), &mem_used, &mem_unit_used); - double mem_used_percent = fabs((double)(ctx->sys.phys_mem_total-ctx->sys.phys_mem_free))/(double)ctx->sys.phys_mem_total*100.0; + mag_humanize_memory_size(ctx->machine.phys_mem_total, &mem_total, &mem_unit_total); + mag_humanize_memory_size(ctx->machine.phys_mem_free, &mem_free, &mem_unit_free); + mag_humanize_memory_size((size_t)llabs((int64_t)ctx->machine.phys_mem_total-(int64_t)ctx->machine.phys_mem_free), &mem_used, &mem_unit_used); + double mem_used_percent = fabs((double)(ctx->machine.phys_mem_total-ctx->machine.phys_mem_free))/(double)ctx->machine.phys_mem_total*100.0; printf("Physical memory: %.03f %s, Free: %.03f %s, Used: %.03f %s (%.02f%%)\n", mem_total, mem_unit_total, mem_free, mem_unit_free, mem_used, mem_unit_used, mem_used_percent); mag_print_separator(stdout); printf("%16s %16s %16s %16s %16s\n", "Operation", "Executions", "Usage (%)", "AVG Time (μs)", "Total Time (μs)"); @@ -2686,74 +2658,126 @@ static void MAG_COLDPROC mag_system_host_info_query_memory(uint64_t* out_phys_me return (uint64_t)lo | ((uint64_t)hi << 32); #endif } - #define mag_cpy_regs(id) \ - 
(*features)[MAG_X86_64_CPUID_##id][MAG_X86_64_CPUID_EAX] = eax; \ - (*features)[MAG_X86_64_CPUID_##id][MAG_X86_64_CPUID_EBX] = ebx; \ - (*features)[MAG_X86_64_CPUID_##id][MAG_X86_64_CPUID_ECX] = ecx; \ - (*features)[MAG_X86_64_CPUID_##id][MAG_X86_64_CPUID_EDX] = edx - static void MAG_COLDPROC mag_system_info_query_x86_64_cpu_features(uint32_t (*features)[8][4]) { + static void MAG_COLDPROC mag_system_info_query_amd64_cpu_caps(uint64_t* caps) { + *caps = 0; + uint32_t regs[8][4] = {0}; + + #define H0 0 + #define H1 1 + #define H2 2 + #define H7 3 + #define H80000001 4 + #define H80000007 5 + #define H16 6 + #define H7_1H 7 + #define EAX 0 + #define EBX 1 + #define ECX 2 + #define EDX 3 + + #define mag_cpy_regs(id) \ + regs[id][EAX] = eax; \ + regs[id][EBX] = ebx; \ + regs[id][ECX] = ecx; \ + regs[id][EDX] = edx + + #define _(enumerator, leaf, reg, shift) (0xff&leaf) + static const uint8_t feature_leaves[MAG_AMD64_CAP__NUM] = { + mag_x86_64_feature_def(_, MAG_SEP) + }; + #undef _ + #define _(enumerator, leaf, reg, shift) (0xff&reg) + static const uint8_t feature_regs[MAG_AMD64_CAP__NUM] = { + mag_x86_64_feature_def(_, MAG_SEP) + }; + #undef _ + #define _(enumerator, leaf, reg, shift) (1u<<(shift)) + static const uint32_t feature_masks[MAG_AMD64_CAP__NUM] = { + mag_x86_64_feature_def(_, MAG_SEP) + }; + #undef _ + #undef mag_x86_64_feature_def + #undef _ + uint32_t eax=0, ebx=0, ecx=0, edx=0; uint32_t max_basic_leaf, max_extended_leaf; mag_cpuid(0, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(0H); + mag_cpy_regs(H0); max_basic_leaf = eax; mag_cpuid(0x80000000u, -1, &eax, &ebx, &ecx, &edx); max_extended_leaf = eax; if (max_basic_leaf >= 1u) { mag_cpuid(1, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(1H); + mag_cpy_regs(H1); } if (max_basic_leaf >= 2u) { mag_cpuid(2u, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(2H); + mag_cpy_regs(H2); } if (max_basic_leaf >= 7u) { mag_cpuid(7u, 0, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(7H); + mag_cpy_regs(H7); } if (max_basic_leaf >= 7u) {
mag_cpuid(7u, 1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(7H_1H); + mag_cpy_regs(H7_1H); } if (max_basic_leaf >= 0x16u) { mag_cpuid(0x16u, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(16H); + mag_cpy_regs(H16); } if (max_extended_leaf >= 0x80000001u) { mag_cpuid(0x80000001u, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(80000001H); + mag_cpy_regs(H80000001); } if (max_extended_leaf >= 0x80000007u) { mag_cpuid(0x80000007u, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(80000007H); + mag_cpy_regs(H80000007); } - bool cpu_avx_support = ((*features)[MAG_X86_64_CPUID_1H][MAG_X86_64_CPUID_ECX] & 0x10000000u) != 0; - bool cpu_osxsave_support = ((*features)[MAG_X86_64_CPUID_1H][MAG_X86_64_CPUID_ECX] & 0x8000000u) != 0; + bool cpu_avx_support = !!(regs[H1][ECX] & 0x10000000u); + bool cpu_osxsave_support = !!(regs[H1][ECX] & 0x8000000u); if (cpu_avx_support && cpu_osxsave_support) { uint64_t xcr0 = mag_xgetbv(); if ((xcr0 & 0x6) != 0x6u) { - (*features)[MAG_X86_64_CPUID_1H][MAG_X86_64_CPUID_ECX] &= ~0x10000000u; /* Clear AVX */ - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EBX] &= ~0x20u; /* Clear AVX2 */ + regs[H1][ECX] &= ~0x10000000u; /* Clear AVX */ + regs[H7][EBX] &= ~0x20u; /* Clear AVX2 */ } if ((xcr0 & 0xe0) != 0xe0u) { /* OS does not support AVX-512, clear AVX512 */ - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EBX] &= ~0xdc230000u; - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_ECX] &= ~0x5842u; - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EDX] &= ~0x10cu; - (*features)[MAG_X86_64_CPUID_7H_1H][MAG_X86_64_CPUID_EAX] &= ~0x20u; + regs[H7][EBX] &= ~0xdc230000u; + regs[H7][ECX] &= ~0x5842u; + regs[H7][EDX] &= ~0x10cu; + regs[H7_1H][EAX] &= ~0x20u; } } else { - (*features)[MAG_X86_64_CPUID_1H][MAG_X86_64_CPUID_ECX] &= ~0x10000000u; /* Clear AVX */ - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EBX] &= ~0x20u; /* Clear AVX2 */ - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EBX] &= ~0xdc230000u; /* Clear AVX512 */ - 
(*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_ECX] &= ~0x5842u; /* Clear AVX512 */ - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EDX] &= ~0x10cu; /* Clear AVX512 */ - (*features)[MAG_X86_64_CPUID_7H_1H][MAG_X86_64_CPUID_EAX] &= ~0x20u; /* Clear AVX512 */ + regs[H1][ECX] &= ~0x10000000u; /* Clear AVX */ + regs[H7][EBX] &= ~0x20u; /* Clear AVX2 */ + regs[H7][EBX] &= ~0xdc230000u; /* Clear AVX512 */ + regs[H7][ECX] &= ~0x5842u; /* Clear AVX512 */ + regs[H7][EDX] &= ~0x10cu; /* Clear AVX512 */ + regs[H7_1H][EAX] &= ~0x20u; /* Clear AVX512 */ } + + for (uint64_t i=1; i < MAG_AMD64_CAP__NUM; ++i) /* Create bitset of features */ + if (regs[feature_leaves[i]][feature_regs[i]] & feature_masks[i]) + *caps |= 1ull<sys.os_name); - mag_system_host_info_query_cpu_name(&ctx->sys.cpu_name); - mag_system_host_info_query_cpu_cores(&ctx->sys.cpu_virtual_cores, &ctx->sys.cpu_physical_cores, &ctx->sys.cpu_sockets); - mag_system_host_info_query_memory(&ctx->sys.phys_mem_total, &ctx->sys.phys_mem_free); + mag_system_host_info_query_os_name(&ctx->machine.os_name); + mag_system_host_info_query_cpu_name(&ctx->machine.cpu_name); + mag_system_host_info_query_cpu_cores(&ctx->machine.cpu_virtual_cores, &ctx->machine.cpu_physical_cores, &ctx->machine.cpu_sockets); + mag_system_host_info_query_memory(&ctx->machine.phys_mem_total, &ctx->machine.phys_mem_free); #if defined(__x86_64__) || defined(_M_X64) - mag_system_info_query_x86_64_cpu_features(&ctx->sys.x86_64_cpu_features); + mag_system_info_query_amd64_cpu_caps(&ctx->machine.amd64_cpu_caps); #elif defined(__aarch64__) - mag_system_info_query_arm64_cpu_features(&ctx->sys.arm64_cpu_features, &ctx->sys.arm64_cpu_sve_width); + mag_system_info_query_arm64_cpu_caps(&ctx->sys.arm64_cpu_features, &ctx->sys.arm64_cpu_sve_width); #endif - if (mag_unlikely(!*ctx->sys.os_name)) snprintf(ctx->sys.os_name, sizeof(ctx->sys.os_name), "Unknown"); - if (mag_unlikely(!*ctx->sys.cpu_name)) snprintf(ctx->sys.cpu_name, sizeof(ctx->sys.cpu_name), 
"Unknown"); + if (mag_unlikely(!*ctx->machine.os_name)) snprintf(ctx->machine.os_name, sizeof(ctx->machine.os_name), "Unknown"); + if (mag_unlikely(!*ctx->machine.cpu_name)) snprintf(ctx->machine.cpu_name, sizeof(ctx->machine.cpu_name), "Unknown"); } static MAG_AINLINE void mag_sto_write_u32_le(uint8_t** p, uint32_t x) { diff --git a/magnetron/magnetron_cpu.c b/magnetron/magnetron_cpu.c index 53e1518..d411166 100644 --- a/magnetron/magnetron_cpu.c +++ b/magnetron/magnetron_cpu.c @@ -10,12 +10,12 @@ extern void mag_cpu_blas_specialization_fallback(mag_kernel_registry_t* kernels) typedef struct mag_amd64_blas_specialization { const char* name; - const mag_x86_64_feature_t* (*get_feature_permutation)(size_t* out_num); + uint64_t (*get_feature_permutation)(void); void (*inject_kernels)(mag_kernel_registry_t* kernels); } mag_amd64_blas_specialization; #define mag_amd64_blas_spec_decl(feat) \ - const mag_x86_64_feature_t* mag_cpu_blas_specialization_amd64_##feat##_features(size_t* out_num); \ + uint64_t mag_cpu_blas_specialization_amd64_##feat##_features(void); \ extern void mag_cpu_blas_specialization_amd64_##feat(mag_kernel_registry_t* kernels) #define mag_amd64_blas_spec_permute(feat) \ @@ -40,15 +40,11 @@ static const mag_amd64_blas_specialization mag_amd64_blas_specializations[] = { }; static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_registry_t* kernels) { + uint64_t cap_avail = ctx->machine.amd64_cpu_caps; for (size_t i=0; i < sizeof(mag_amd64_blas_specializations)/sizeof(*mag_amd64_blas_specializations); ++i) { /* Find best blas spec for the host CPU */ const mag_amd64_blas_specialization* spec = mag_amd64_blas_specializations+i; - size_t num_features = 0; - const mag_x86_64_feature_t* features = (*spec->get_feature_permutation)(&num_features); /* Get requires features */ - if (mag_unlikely(!num_features || !features)) continue; - bool has_all_features = true; - for (size_t j=0; j < num_features; ++j) /* For each requested feature, 
check if host CPU supports it */ - has_all_features &= mag_ctx_x86_64_cpu_has_feature(ctx, features[j]); - if (has_all_features) { /* Since specializations are sorted by score, we found the perfect spec. */ + uint64_t cap_required = (*spec->get_feature_permutation)(); /* Get requires features */ + if ((cap_avail & cap_required) == cap_required) { /* Since specializations are sorted by score, we found the perfect spec. */ (*spec->inject_kernels)(kernels); mag_log_info("Using tuned BLAS specialization: %s", spec->name); return true; @@ -444,7 +440,7 @@ static mag_compute_device_t* mag_cpu_init_interface(mag_ctx_t* ctx, uint32_t num .alloc_storage = &mag_cpu_alloc_storage, .free_storage = &mag_cpu_free_storage }; - snprintf(dvc->name, sizeof(dvc->name), "%s - %s - Using %u Compute Threads", mag_device_type_get_name(dvc->type), ctx->sys.cpu_name, num_threads); + snprintf(dvc->name, sizeof(dvc->name), "%s", ctx->machine.cpu_name); return dvc; } @@ -455,7 +451,7 @@ static void mag_cpu_release_interface(mag_compute_device_t* ctx) { } mag_compute_device_t* mag_init_device_cpu(mag_ctx_t* ctx, const mag_device_descriptor_t* desc) { - uint32_t hw_concurrency = mag_xmax(1, ctx->sys.cpu_virtual_cores); + uint32_t hw_concurrency = mag_xmax(1, ctx->machine.cpu_virtual_cores); uint32_t num_threads = desc->thread_count; num_threads = num_threads ? 
num_threads : hw_concurrency; mag_compute_device_t* dvc = mag_cpu_init_interface(ctx, num_threads); diff --git a/magnetron/magnetron_cpu_blas.inl b/magnetron/magnetron_cpu_blas.inl index 82587d6..ad1e854 100644 --- a/magnetron/magnetron_cpu_blas.inl +++ b/magnetron/magnetron_cpu_blas.inl @@ -1411,132 +1411,129 @@ static void MAG_HOTPROC mag_blas_matmul_f32(const mag_compute_payload_t* payload #endif #if defined(__x86_64__) || defined(_M_X64) -const mag_x86_64_feature_t* MAG_BLAS_SPECIALIZATION_FEAT_REQUEST(size_t* out_num) { - static const mag_x86_64_feature_t required_features[] = { - #ifdef __AVX512F__ - MAG_X86_64_FEATURE_AVX512F, - #endif - #ifdef __AVX512BW__ - MAG_X86_64_FEATURE_AVX512BW, - #endif - #ifdef __AVX512CD__ - MAG_X86_64_FEATURE_AVX512CD, - #endif - #ifdef __AVX512DQ__ - MAG_X86_64_FEATURE_AVX512DQ, - #endif - #ifdef __AVX512ER__ - MAG_X86_64_FEATURE_AVX512ER, - #endif - #ifdef __AVX512IFMA__ - MAG_X86_64_FEATURE_AVX512IFMA, - #endif - #ifdef __AVX512PF__ - MAG_X86_64_FEATURE_AVX512PF, - #endif - #ifdef __AVX512VBMI__ - MAG_X86_64_FEATURE_AVX512VBMI, - #endif - #ifdef __AVX512VL__ - MAG_X86_64_FEATURE_AVX512VL, - #endif - #ifdef __AVX512_4FMAPS__ - MAG_X86_64_FEATURE_AVX512_4FMAPS, - #endif - #ifdef __AVX512_4VNNIW__ - MAG_X86_64_FEATURE_AVX512_4VNNIW, - #endif - #ifdef __AVX512_FP16__ - MAG_X86_64_FEATURE_AVX512_FP16, - #endif - #ifdef __AVX512_BF16__ - MAG_X86_64_FEATURE_AVX512_BF16, - #endif - #ifdef __AVX512_BITALG__ - MAG_X86_64_FEATURE_AVX512_BITALG, - #endif - #ifdef __AVX512_VBMI2__ - MAG_X86_64_FEATURE_AVX512_VBMI2, - #endif - #ifdef __AVX512_VNNI__ - MAG_X86_64_FEATURE_AVX512_VNNI, - #endif - #ifdef __AVX512_VP2INTERSECT__ - MAG_X86_64_FEATURE_AVX512_VP2INTERSECT, - #endif - #ifdef __AVX512_VPOPCNTDQ__ - MAG_X86_64_FEATURE_AVX512_VPOPCNTDQ, - #endif - #ifdef __AVX__ - MAG_X86_64_FEATURE_AVX, - #endif - #ifdef __AVX2__ - MAG_X86_64_FEATURE_AVX2, - #endif - #ifdef __AVXVNNI__ - MAG_X86_64_FEATURE_AVXVNNI, - #endif - #ifdef __AVXVNNIINT8__ - 
MAG_X86_64_FEATURE_AVXVNNIINT8, - #endif - #ifdef __AVXVNNIINT16__ - MAG_X86_64_FEATURE_AVXVNNIINT16, - #endif - #ifdef __BMI__ - MAG_X86_64_FEATURE_BMI, - #endif - #ifdef __BMI2__ - MAG_X86_64_FEATURE_BMI2, - #endif - #ifdef __F16C__ - MAG_X86_64_FEATURE_F16C, - #endif - #ifdef __FMA__ - MAG_X86_64_FEATURE_FMA, - #endif - #ifdef __GFNI__ - MAG_X86_64_FEATURE_GFNI, - #endif - #ifdef __PCLMUL__ - MAG_X86_64_FEATURE_PCLMUL, - #endif - #ifdef __RDRND__ - MAG_X86_64_FEATURE_RDRND, - #endif - #ifdef __RDSEED__ - MAG_X86_64_FEATURE_RDSEED, - #endif - #ifdef __RDTSCP__ - MAG_X86_64_FEATURE_RDTSCP, - #endif - #ifdef __SHA__ - MAG_X86_64_FEATURE_SHA, - #endif - #ifdef __SSE3__ - MAG_X86_64_FEATURE_SSE3, - #endif - #ifdef __SSE4_1__ - MAG_X86_64_FEATURE_SSE4_1, - #endif - #ifdef __SSE4_2__ - MAG_X86_64_FEATURE_SSE4_2, - #endif - #ifdef __SSSE3__ - MAG_X86_64_FEATURE_SSSE3, - #endif - #ifdef __VAES__ - MAG_X86_64_FEATURE_VAES, - #endif - #ifdef __VPCLMULQDQ__ - MAG_X86_64_FEATURE_VPCLMULQDQ, - #endif - #ifdef __XSAVE__ - MAG_X86_64_FEATURE_XSAVE, - #endif - MAG_X86_64_FEATURE_SSE2, /* always required */ - }; - *out_num = sizeof(required_features)/sizeof(*required_features); - return required_features; +uint64_t MAG_BLAS_SPECIALIZATION_FEAT_REQUEST() { + uint64_t caps = 1ull<var, prefix) -#if defined(__x86_64__) || defined(_M_X64) -extern const uint8_t mag_x86_64_feature_leaves[MAG_X86_64_FEATURE__NUM]; -extern const uint8_t mag_x86_64_feature_regs[MAG_X86_64_FEATURE__NUM]; -extern const uint32_t mag_x86_64_feature_masks[MAG_X86_64_FEATURE__NUM]; - -static inline bool mag_ctx_x86_64_cpu_has_feature(const mag_ctx_t* ctx, mag_x86_64_feature_t feature) { - const uint8_t (*leafs)[49] = &mag_x86_64_feature_leaves; - const uint8_t (*regs)[49] = &mag_x86_64_feature_regs; - const uint32_t (*masks)[49] = &mag_x86_64_feature_masks; - const uint32_t (*features)[8][4] = &ctx->sys.x86_64_cpu_features; - return (*features)[(*leafs)[feature]][(*regs)[feature]] & (*masks)[feature]; -} -#endif 
- #ifdef __cplusplus } #endif From 7b08b1351952f44120f88049bfeadc4197eff67a Mon Sep 17 00:00:00 2001 From: Neo Date: Fri, 24 Jan 2025 00:18:34 +0100 Subject: [PATCH 09/16] Refractored blas specializations and tuning --- cmake/blas_tune.cmake | 20 ++++++------ magnetron/magnetron_cpu.c | 32 +++++++++---------- magnetron/magnetron_cpu_blas.inl | 23 ++++++++++++- magnetron/magnetron_cpu_blas_amd64_avx2.c | 13 -------- magnetron/magnetron_cpu_blas_amd64_avx512f.c | 13 -------- ..._sse42.c => magnetron_cpu_blas_amd64_v2.c} | 11 +++++-- ..._avx.c => magnetron_cpu_blas_amd64_v2_5.c} | 12 +++++-- magnetron/magnetron_cpu_blas_amd64_v3.c | 24 ++++++++++++++ magnetron/magnetron_cpu_blas_amd64_v4.c | 28 ++++++++++++++++ magnetron/magnetron_cpu_blas_amd64_v4_5.c | 30 +++++++++++++++++ magnetron/magnetron_cpu_blas_amd64_znver4.c | 11 ------- 11 files changed, 147 insertions(+), 70 deletions(-) delete mode 100644 magnetron/magnetron_cpu_blas_amd64_avx2.c delete mode 100644 magnetron/magnetron_cpu_blas_amd64_avx512f.c rename magnetron/{magnetron_cpu_blas_amd64_sse42.c => magnetron_cpu_blas_amd64_v2.c} (64%) rename magnetron/{magnetron_cpu_blas_amd64_avx.c => magnetron_cpu_blas_amd64_v2_5.c} (61%) create mode 100644 magnetron/magnetron_cpu_blas_amd64_v3.c create mode 100644 magnetron/magnetron_cpu_blas_amd64_v4.c create mode 100644 magnetron/magnetron_cpu_blas_amd64_v4_5.c delete mode 100644 magnetron/magnetron_cpu_blas_amd64_znver4.c diff --git a/cmake/blas_tune.cmake b/cmake/blas_tune.cmake index 6ca481c..216511a 100644 --- a/cmake/blas_tune.cmake +++ b/cmake/blas_tune.cmake @@ -10,11 +10,11 @@ function(set_blas_spec_arch filename posix_arch msvc_arch) endfunction() set(MAGNETRON_BLAS_SPEC_AMD64_SOURCES - magnetron/magnetron_cpu_blas_amd64_sse42.c - magnetron/magnetron_cpu_blas_amd64_avx.c - magnetron/magnetron_cpu_blas_amd64_avx2.c - magnetron/magnetron_cpu_blas_amd64_avx512f.c - magnetron/magnetron_cpu_blas_amd64_znver4.c + magnetron/magnetron_cpu_blas_amd64_v2.c + 
magnetron/magnetron_cpu_blas_amd64_v2_5.c + magnetron/magnetron_cpu_blas_amd64_v3.c + magnetron/magnetron_cpu_blas_amd64_v4.c + magnetron/magnetron_cpu_blas_amd64_v4_5.c ) set(MAGNETRON_BLAS_SPEC_ARM64_SOURCES @@ -24,11 +24,11 @@ set(MAGNETRON_BLAS_SPEC_ARM64_SOURCES if (${IS_AMD64}) # x86-64 specific compilation options set(MAGNETRON_SOURCES ${MAGNETRON_SOURCES} ${MAGNETRON_BLAS_SPEC_AMD64_SOURCES}) - set_blas_spec_arch("magnetron_cpu_blas_amd64_sse42.c" "-mtune=nehalem -msse4.2" "/arch:SSE4.2") - set_blas_spec_arch("magnetron_cpu_blas_amd64_avx.c" "-mtune=sandybridge -mavx" "/arch:AVX") - set_blas_spec_arch("magnetron_cpu_blas_amd64_avx2.c" "-mtune=skylake -mavx -mavx2 -mfma -mf16c" "/arch:AVX2") - set_blas_spec_arch("magnetron_cpu_blas_amd64_avx512f.c" "-mtune=cannonlake -mavx -mavx2 -mfma -mf16c -mavx512f" "/arch:AVX512") - set_blas_spec_arch("magnetron_cpu_blas_amd64_znver4.c" "-mavx -mavx2 -mfma -mf16c -mavx512f -mavx512vl -mavx512vnni -mavx512bf16 -mavx512bw -mavx512dq" "/arch:AVX512") + set_blas_spec_arch("magnetron_cpu_blas_amd64_v2.c" "-mtune=nehalem -mcx16 -mpopcnt -msse3 -mssse3 -msse4.1 -msse4.2" "/arch:SSE4.2") + set_blas_spec_arch("magnetron_cpu_blas_amd64_v2_5.c" "-mtune=ivybridge -mavx -mno-avx2 -mcx16 -mpopcnt -msse3 -mssse3 -msse4.1 -msse4.2" "/arch:AVX") + set_blas_spec_arch("magnetron_cpu_blas_amd64_v3.c" "-mtune=haswell -mavx -mavx2 -mbmi -mbmi2 -mf16c -mfma -mlzcnt -mmovbe" "/arch:AVX2") + set_blas_spec_arch("magnetron_cpu_blas_amd64_v4.c" "-mtune=cannonlake -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx -mavx2 -mbmi -mbmi2 -mf16c -mfma -mlzcnt -mmovbe" "/arch:AVX512") + set_blas_spec_arch("magnetron_cpu_blas_amd64_v4_5.c" "-mtune=generic -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mavx -mavx2 -mbmi -mbmi2 -mf16c -mfma -mlzcnt -mmovbe" "/arch:AVX512") elseif(${IS_ARM64}) set(MAGNETRON_SOURCES ${MAGNETRON_SOURCES} ${MAGNETRON_BLAS_SPEC_ARM64_SOURCES}) set_blas_spec_arch("magnetron_cpu_blas_arm64_v8_2.c" 
"-march=armv8.2-a+dotprod+fp16" "") diff --git a/magnetron/magnetron_cpu.c b/magnetron/magnetron_cpu.c index d411166..cdc0862 100644 --- a/magnetron/magnetron_cpu.c +++ b/magnetron/magnetron_cpu.c @@ -15,28 +15,28 @@ typedef struct mag_amd64_blas_specialization { } mag_amd64_blas_specialization; #define mag_amd64_blas_spec_decl(feat) \ - uint64_t mag_cpu_blas_specialization_amd64_##feat##_features(void); \ - extern void mag_cpu_blas_specialization_amd64_##feat(mag_kernel_registry_t* kernels) + uint64_t mag_cpu_blas_specialization_amd64_v##feat##_features(void); \ + extern void mag_cpu_blas_specialization_amd64_v##feat(mag_kernel_registry_t* kernels) #define mag_amd64_blas_spec_permute(feat) \ (mag_amd64_blas_specialization) { \ - .name = "amd64_"#feat, \ - .get_feature_permutation = &mag_cpu_blas_specialization_amd64_##feat##_features, \ - .inject_kernels = &mag_cpu_blas_specialization_amd64_##feat \ + .name = "amd64-v"#feat, \ + .get_feature_permutation = &mag_cpu_blas_specialization_amd64_v##feat##_features, \ + .inject_kernels = &mag_cpu_blas_specialization_amd64_v##feat \ } -mag_amd64_blas_spec_decl(znver4); -mag_amd64_blas_spec_decl(avx512f); -mag_amd64_blas_spec_decl(avx2); -mag_amd64_blas_spec_decl(avx); -mag_amd64_blas_spec_decl(sse41); +mag_amd64_blas_spec_decl(4_5); +mag_amd64_blas_spec_decl(4); +mag_amd64_blas_spec_decl(3); +mag_amd64_blas_spec_decl(2_5); +mag_amd64_blas_spec_decl(2); static const mag_amd64_blas_specialization mag_amd64_blas_specializations[] = { /* Dynamic selectable BLAS permutations, sorted from best to worst score. 
*/ - mag_amd64_blas_spec_permute(znver4), - mag_amd64_blas_spec_permute(avx512f), - mag_amd64_blas_spec_permute(avx2), - mag_amd64_blas_spec_permute(avx), - mag_amd64_blas_spec_permute(sse41), + mag_amd64_blas_spec_permute(4_5), + mag_amd64_blas_spec_permute(4), + mag_amd64_blas_spec_permute(3), + mag_amd64_blas_spec_permute(2_5), + mag_amd64_blas_spec_permute(2), }; static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_registry_t* kernels) { @@ -73,7 +73,7 @@ typedef struct mag_arm64_blas_specialization { #define mag_arm64_blas_spec_permute(feat) \ (mag_arm64_blas_specialization) { \ - .name = "arm64_"#feat, \ + .name = "arm64-"#feat, \ .get_cap_permutation = &mag_cpu_blas_specialization_arm64_v_##feat##_features, \ .inject_kernels = &mag_cpu_blas_specialization_arm64_v_##feat \ } diff --git a/magnetron/magnetron_cpu_blas.inl b/magnetron/magnetron_cpu_blas.inl index ad1e854..332ed9d 100644 --- a/magnetron/magnetron_cpu_blas.inl +++ b/magnetron/magnetron_cpu_blas.inl @@ -1,8 +1,29 @@ /* (c) 2025 Mario "Neo" Sieg. */ /* +** This file implements the core math for magnetron, optimized for different CPU instruction sets. +** This file is also included into different compilation units, which are all compiled with different architecture flags, thus the impl is 'cloned'. +** At runtime the best impl for the host-CPU is chosen automatically, by detecting the CPU and querying the hardware features. ** -** + ARM 64 Versions and Features +** !!! Minimum Requirements!!! +** AMD 64 CPUs: SSE & SSE2 (any 64-bit AMD64 CPU). +** ARM 64 CPUs: ARM v8-a (Raspberry Pi 4, 5, Apple M1-4, Neoverse/Graviton etc..)
+** +** +==============+=============+==============+======================================================+ +** | AMD 64 Versions and Features +** +==============+=============+==============+======================================================+ +** | x86-64-v1 | CMOV, CX8, FPU, FXSR, MMX, OSFXSR, SCE, SSE, SSE2 +** | x86-64-v2 | CMPXCHG16B, LAHF-SAHF, POPCNT, SSE3, SSE4_1, SSE4_2, SSSE3 +** | x86-64-v3 | AVX, AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE, OSXSAVE +** | x86-64-v4 | AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL +** +==============+=============+==============+======================================================+ +** Some CPUs fall in between those, for example my old rusty test server has four old AMD Opteron CPUs with 16 cores each. They support AVX but not AVX2. +** For CPUs like this, we still support more granular feature levels: SSE42, AVX, AVX2 and AVX512F. +** +** +** +** +==============+=============+==============+======================================================+ +** | ARM 64 Versions and Features ** +==============+=============+==============+======================================================+ ** | armv8-a | Armv8-A | | +fp, +simd ** | armv8.1-a | Armv8.1-A | armv8-a, | +crc, +lse, +rdma diff --git a/magnetron/magnetron_cpu_blas_amd64_avx2.c b/magnetron/magnetron_cpu_blas_amd64_avx2.c deleted file mode 100644 index 505d4c1..0000000 --- a/magnetron/magnetron_cpu_blas_amd64_avx2.c +++ /dev/null @@ -1,13 +0,0 @@ -/* (c) 2025 Mario "Neo" Sieg.
*/ - -#ifndef __AVX2__ -#error "BLAS specialization requires matching compile flags" -#endif -#ifdef __AVX512f__ -#error "BLAS specialization feature too high" -#endif - -#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_avx2 -#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_avx2_features - -#include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_avx512f.c b/magnetron/magnetron_cpu_blas_amd64_avx512f.c deleted file mode 100644 index df03a17..0000000 --- a/magnetron/magnetron_cpu_blas_amd64_avx512f.c +++ /dev/null @@ -1,13 +0,0 @@ -/* (c) 2025 Mario "Neo" Sieg. */ - -#ifndef __AVX512F__ -#error "BLAS specialization requires matching compile flags" -#endif -#ifdef __AVX512DQ__ -//#error "BLAS specialization feature too high" -#endif - -#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_avx512f -#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_avx512f_features - -#include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_sse42.c b/magnetron/magnetron_cpu_blas_amd64_v2.c similarity index 64% rename from magnetron/magnetron_cpu_blas_amd64_sse42.c rename to magnetron/magnetron_cpu_blas_amd64_v2.c index 656cd98..29b2119 100644 --- a/magnetron/magnetron_cpu_blas_amd64_sse42.c +++ b/magnetron/magnetron_cpu_blas_amd64_v2.c @@ -1,13 +1,18 @@ /* (c) 2025 Mario "Neo" Sieg. 
*/ -#ifndef __SSE4_2__ +#if !defined(__SSE__) \ + || !defined(__SSE2__) \ + || !defined(__SSE3__) \ + || !defined(__SSSE3__) \ + || !defined(__SSE4_1__) \ + || !defined(__SSE4_2__) #error "BLAS specialization requires matching compile flags" #endif #ifdef __AVX__ #error "BLAS specialization feature too high" #endif -#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_sse41 -#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_sse41_features +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_v2 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_v2_features #include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_avx.c b/magnetron/magnetron_cpu_blas_amd64_v2_5.c similarity index 61% rename from magnetron/magnetron_cpu_blas_amd64_avx.c rename to magnetron/magnetron_cpu_blas_amd64_v2_5.c index f82ad40..625a5cd 100644 --- a/magnetron/magnetron_cpu_blas_amd64_avx.c +++ b/magnetron/magnetron_cpu_blas_amd64_v2_5.c @@ -1,13 +1,19 @@ /* (c) 2025 Mario "Neo" Sieg. 
*/ -#ifndef __AVX__ +#if !defined(__SSE__) \ + || !defined(__SSE2__) \ + || !defined(__SSE3__) \ + || !defined(__SSSE3__) \ + || !defined(__SSE4_1__) \ + || !defined(__SSE4_2__) \ + || !defined(__AVX__) #error "BLAS specialization requires matching compile flags" #endif #ifdef __AVX2__ #error "BLAS specialization feature too high" #endif -#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_avx -#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_avx_features +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_v2_5 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_v2_5_features #include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_v3.c b/magnetron/magnetron_cpu_blas_amd64_v3.c new file mode 100644 index 0000000..d0064fb --- /dev/null +++ b/magnetron/magnetron_cpu_blas_amd64_v3.c @@ -0,0 +1,24 @@ +/* (c) 2025 Mario "Neo" Sieg. */ + +#if !defined(__SSE__) \ + || !defined(__SSE2__) \ + || !defined(__SSE3__) \ + || !defined(__SSSE3__) \ + || !defined(__SSE4_1__) \ + || !defined(__SSE4_2__) \ + || !defined(__AVX__) \ + || !defined(__AVX2__) \ + || !defined(__BMI__) \ + || !defined(__BMI2__) \ + || !defined(__F16C__) \ + || !defined(__FMA__) +#error "BLAS specialization requires matching compile flags" +#endif +#ifdef __AVX512F__ +#error "BLAS specialization feature too high" +#endif + +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_v3 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_v3_features + +#include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_v4.c b/magnetron/magnetron_cpu_blas_amd64_v4.c new file mode 100644 index 0000000..2c706f6 --- /dev/null +++ b/magnetron/magnetron_cpu_blas_amd64_v4.c @@ -0,0 +1,28 @@ +/* (c) 2025 Mario "Neo" Sieg. 
*/ + +#if !defined(__SSE__) \ + || !defined(__SSE2__) \ + || !defined(__SSE3__) \ + || !defined(__SSSE3__) \ + || !defined(__SSE4_1__) \ + || !defined(__SSE4_2__) \ + || !defined(__AVX__) \ + || !defined(__AVX2__) \ + || !defined(__BMI__) \ + || !defined(__BMI2__) \ + || !defined(__F16C__) \ + || !defined(__FMA__) \ + || !defined(__AVX512F__) \ + || !defined(__AVX512BW__) \ + || !defined(__AVX512DQ__) \ + || !defined(__AVX512VL__) +#error "BLAS specialization requires matching compile flags" +#endif +#ifdef __AVX512VNNI__ +#error "BLAS specialization feature too high" +#endif + +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_v4 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_v4_features + +#include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_v4_5.c b/magnetron/magnetron_cpu_blas_amd64_v4_5.c new file mode 100644 index 0000000..ddca599 --- /dev/null +++ b/magnetron/magnetron_cpu_blas_amd64_v4_5.c @@ -0,0 +1,30 @@ +/* (c) 2025 Mario "Neo" Sieg. 
*/ + +#if !defined(__SSE__) \ + || !defined(__SSE2__) \ + || !defined(__SSE3__) \ + || !defined(__SSSE3__) \ + || !defined(__SSE4_1__) \ + || !defined(__SSE4_2__) \ + || !defined(__AVX__) \ + || !defined(__AVX2__) \ + || !defined(__BMI__) \ + || !defined(__BMI2__) \ + || !defined(__F16C__) \ + || !defined(__FMA__) \ + || !defined(__AVX512F__) \ + || !defined(__AVX512BW__) \ + || !defined(__AVX512DQ__) \ + || !defined(__AVX512VL__) \ + || !defined(__AVX512VNNI__) \ + || !defined(__AVX512BF16__) +#error "BLAS specialization requires matching compile flags" +#endif +#ifdef __APX__ +#error "BLAS specialization feature too high" +#endif + +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_v4_5 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_v4_5_features + +#include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_znver4.c b/magnetron/magnetron_cpu_blas_amd64_znver4.c deleted file mode 100644 index 23bca62..0000000 --- a/magnetron/magnetron_cpu_blas_amd64_znver4.c +++ /dev/null @@ -1,11 +0,0 @@ -/* (c) 2025 Mario "Neo" Sieg. 
*/ - -#if !defined(__AVX512F__) || !defined(__AVX512VL__) || !defined(__AVX512VNNI__) - || !defined(__AVX512BF16__) || !defined(__AVX512BW__) || !defined(__AVX512DQ__) -#error "BLAS specialization requires matching compile flags" -#endif - -#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_znver4 -#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_znver4_features - -#include "magnetron_cpu_blas.inl" From 462df365ca5370c4185841593f6b63ae94a398dc Mon Sep 17 00:00:00 2001 From: Mario Sieg Date: Fri, 24 Jan 2025 01:11:47 +0000 Subject: [PATCH 10/16] Fix linux-arm64 build and runtime --- cmake/compiler_config.cmake | 4 +-- magnetron/magnetron.c | 57 ++++++++++++++++------------------ magnetron/magnetron_cpu.c | 14 ++++----- magnetron/magnetron_internal.h | 4 +-- 4 files changed, 38 insertions(+), 41 deletions(-) diff --git a/cmake/compiler_config.cmake b/cmake/compiler_config.cmake index 155a970..a4e443d 100644 --- a/cmake/compiler_config.cmake +++ b/cmake/compiler_config.cmake @@ -52,8 +52,8 @@ set(MAG_GCC_LINK_OPTIONS "") set(MAG_GCC_RELEASE_LINK_OPTIONS -flto) if (${IS_ARM64}) - set(MAG_CLANG_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -march=armv8.2-a) - set(MAG_GCC_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -march=armv8.2-a) + set(MAG_CLANG_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -march=armv8-a) + set(MAG_GCC_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -march=armv8-a) elseif (${IS_AMD64}) set(MAG_CLANG_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -msse -msse2) set(MAG_GCC_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -msse -msse2) diff --git a/magnetron/magnetron.c b/magnetron/magnetron.c index ce77c0e..ccb475b 100644 --- a/magnetron/magnetron.c +++ b/magnetron/magnetron.c @@ -57,8 +57,7 @@ #include #include #ifdef __aarch64__ -#include -#include #endif #endif #endif @@ -477,21 +476,17 @@ static void mag_system_host_info_dump(mag_ctx_t* ctx) { #if defined(__x86_64__) || defined(_M_X64) /* Print CPU features for x86-64 platforms. 
*/ if (mag_log_enabled) { printf(MAG_CC_CYAN "[magnetron] " MAG_CC_RESET "%s caps: ", cpu_arch); - for (uint64_t i=0; i < MAG_AMD64_CAP__NUM; ++i) { - if (ctx->machine.amd64_cpu_caps & (1ull<machine.amd64_cpu_caps & (1ull<machine.arm64_cpu_features & (1u<machine.arm64_cpu_caps & (1ull<machine.amd64_cpu_caps); #elif defined(__aarch64__) - mag_system_info_query_arm64_cpu_caps(&ctx->sys.arm64_cpu_features, &ctx->sys.arm64_cpu_sve_width); + mag_system_info_query_arm64_cpu_caps(&ctx->machine.arm64_cpu_caps, &ctx->machine.arm64_cpu_sve_width); #endif if (mag_unlikely(!*ctx->machine.os_name)) snprintf(ctx->machine.os_name, sizeof(ctx->machine.os_name), "Unknown"); if (mag_unlikely(!*ctx->machine.cpu_name)) snprintf(ctx->machine.cpu_name, sizeof(ctx->machine.cpu_name), "Unknown"); @@ -3073,7 +3070,7 @@ MAG_EXPORT mag_tensor_t** mag_sto_read_buffered(mag_ctx_t* ctx, const uint8_t* b if (mag_unlikely(!mag_sto_read_file_header(&needle, end, out_version, &n_tensors, &ud))) return NULL; /* Read file header */ if (mag_unlikely(!*out_version || *out_version > MAG_VERSION)) return NULL; if (mag_unlikely(!n_tensors)) return NULL; - mag_tensor_t** tensors = (mag_tensor_t**)(*mag_alloc)(NULL, n_tensors*sizeof(*tensors)); /* Allocate return tensor array */ + mag_tensor_t** tensors = (*mag_alloc)(NULL, n_tensors*sizeof(*tensors)); /* Allocate return tensor array */ for (size_t i=0; i < n_tensors; ++i) { /* Read tensor headers */ char name[MAG_MAX_TENSOR_NAME_LEN] = {0}; mag_tensor_flags_t flags = 0; diff --git a/magnetron/magnetron_cpu.c b/magnetron/magnetron_cpu.c index cdc0862..8179728 100644 --- a/magnetron/magnetron_cpu.c +++ b/magnetron/magnetron_cpu.c @@ -52,7 +52,7 @@ static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_re } /* No matching specialization found, use generic */ mag_cpu_blas_specialization_fallback(kernels); - mag_log_warn("Using fallback BLAS specialization"); + mag_log_info("Using fallback BLAS specialization"); return false; /* No spec 
used, fallback is active */ } @@ -63,12 +63,12 @@ static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_re typedef struct mag_arm64_blas_specialization { const char* name; - mag_arm64_cap_t (*get_cap_permutation)(void); + uint64_t (*get_cap_permutation)(void); void (*inject_kernels)(mag_kernel_registry_t* kernels); } mag_arm64_blas_specialization; #define mag_arm64_blas_spec_decl(feat) \ - mag_arm64_cap_t mag_cpu_blas_specialization_arm64_v_##feat##_features(void); \ + uint64_t mag_cpu_blas_specialization_arm64_v_##feat##_features(void); \ extern void mag_cpu_blas_specialization_arm64_v_##feat(mag_kernel_registry_t* kernels) #define mag_arm64_blas_spec_permute(feat) \ @@ -87,10 +87,10 @@ static const mag_arm64_blas_specialization mag_arm64_blas_specializations[] = { }; static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_registry_t* kernels) { - mag_arm64_cap_t cap_avail = ctx->sys.arm64_cpu_features; - for (size_t i=0; i < sizeof(mag_arm64_blas_specializations)/sizeof(*mag_arm64_blas_specializations); ++i) { /* Find best blas spec for the host CPU */ + uint64_t cap_avail = ctx->machine.arm64_cpu_caps; + for (size_t i=1; i < sizeof(mag_arm64_blas_specializations)/sizeof(*mag_arm64_blas_specializations); ++i) { /* Find best blas spec for the host CPU */ const mag_arm64_blas_specialization* spec = mag_arm64_blas_specializations+i; - mag_arm64_cap_t cap_required = (*spec->get_cap_permutation)(); /* Get requires features */ + uint64_t cap_required = (*spec->get_cap_permutation)(); /* Get requires features */ if ((cap_avail & cap_required) == cap_required) { /* Since specializations are sorted by score, we found the perfect spec. 
*/ (*spec->inject_kernels)(kernels); mag_log_info("Using tuned BLAS specialization: %s", spec->name); @@ -99,7 +99,7 @@ static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_re } /* No matching specialization found, use generic */ mag_cpu_blas_specialization_fallback(kernels); - mag_log_warn("Using fallback BLAS specialization"); + mag_log_info("Using fallback BLAS specialization"); return false; /* No spec used, fallback is active */ } diff --git a/magnetron/magnetron_internal.h b/magnetron/magnetron_internal.h index c8d8e4d..d1defb6 100644 --- a/magnetron/magnetron_internal.h +++ b/magnetron/magnetron_internal.h @@ -632,7 +632,7 @@ typedef enum mag_amd64_cap_t { MAG_AMD64_CAP__NUM } mag_amd64_cap_t; #undef _ -#define mag_amd64_cap(cap) (1ull<<(MAG_AMD64_CAP_##cap)) + extern const char* const mag_amd64_cap_names[MAG_AMD64_CAP__NUM]; #elif defined(__aarch64__) @@ -675,7 +675,7 @@ struct mag_ctx_t { #if defined(__x86_64__) || defined(_M_X64) uint64_t amd64_cpu_caps; /* x86-64 CPU features. 
Bitset of 1ull< Date: Fri, 24 Jan 2025 02:14:45 +0100 Subject: [PATCH 11/16] arm64 OSX fix CPU detection --- magnetron/magnetron.c | 14 +++++++------- magnetron/magnetron_cpu.c | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/magnetron/magnetron.c b/magnetron/magnetron.c index ccb475b..80561aa 100644 --- a/magnetron/magnetron.c +++ b/magnetron/magnetron.c @@ -2790,19 +2790,19 @@ static void MAG_COLDPROC mag_system_info_query_arm64_cpu_caps(uint64_t* caps, in int sx = 0; size_t size = sizeof(sx); if (sysctlbyname("hw.optional.AdvSIMD", &sx, &size, NULL, 0) != 0) sx = 0; - if (sx) *feat |= 1ull< Date: Fri, 24 Jan 2025 02:20:40 +0100 Subject: [PATCH 12/16] Fix python tests --- .github/workflows/cmake-multi-platform.yml | 2 +- .gitignore | 1 + python/tests/__init__.py | 0 python/tests/tensor.py | 2 +- python/tests/tensor_fill.py | 24 +++++++++++----------- python/tests/tensor_ops1.py | 22 ++++++++++---------- python/tests/tests.py | 2 +- 7 files changed, 27 insertions(+), 26 deletions(-) create mode 100644 python/tests/__init__.py diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index 71af77e..471b207 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -103,4 +103,4 @@ jobs: shell: bash run: | . 
.venv/bin/activate - python -m pytest ${{ github.workspace }}/python/tests/tests.py + python -m pytest ${{ github.workspace }}/python/tests/* diff --git a/.gitignore b/.gitignore index dd3c6b5..f42a950 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ out/ .tmp CMakeSettings.json magnetron_chat/.idea +/build/ diff --git a/python/tests/__init__.py b/python/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/tests/tensor.py b/python/tests/tensor.py index c3eb5a7..2151636 100644 --- a/python/tests/tensor.py +++ b/python/tests/tensor.py @@ -39,4 +39,4 @@ def test_tensor_to_list(): tensor[1] = 255 tensor[2] = -22333 tensor[3] = 22 - assert tensor.to_list() == [128, 255, -22333, 22] + assert tensor.tolist() == [128, 255, -22333, 22] diff --git a/python/tests/tensor_fill.py b/python/tests/tensor_fill.py index 116fe27..79a2fff 100644 --- a/python/tests/tensor_fill.py +++ b/python/tests/tensor_fill.py @@ -4,31 +4,31 @@ def test_tensor_fill_zero(): tensor = Tensor.zeros((1, 2, 3, 4, 5, 6)) - data = tensor.to_list() + data = tensor.tolist() assert len(data) == 1 * 2 * 3 * 4 * 5 * 6 assert all([x == 0 for x in data]) def test_tensor_fill_x(): tensor = Tensor.full((1, 2, 3, 4, 5, 6), fill_value=-22) - data = tensor.to_list() + data = tensor.tolist() assert len(data) == 1 * 2 * 3 * 4 * 5 * 6 assert all([x == -22 for x in data]) def test_tensor_fill_uniform(): tensor = Tensor.uniform((1, 2, 3, 4, 5, 6), interval=(-1, 1)) - data = tensor.to_list() + data = tensor.tolist() assert len(data) == 1 * 2 * 3 * 4 * 5 * 6 assert all([-1 <= x <= 1 for x in data]) def test_tensor_fill_uniform2(): tensor = Tensor.uniform((1, 2, 3, 4, 5, 6), interval=(0, 100)) - data = tensor.to_list() + data = tensor.tolist() assert len(data) == 1 * 2 * 3 * 4 * 5 * 6 assert all([0 <= x <= 100 for x in data]) def test_tensor_fill_uniform3(): tensor = Tensor.uniform((1, 2, 3, 4, 5, 6), interval=(-1000, -20)) - data = tensor.to_list() + data = tensor.tolist() assert 
len(data) == 1 * 2 * 3 * 4 * 5 * 6 assert all([-1000 <= x <= -20 for x in data]) @@ -36,7 +36,7 @@ def test_tensor_fill_normal(): mean = 0.0 stddev = 1 tensor = Tensor.normal((1, 2, 3, 4, 5, 6), mean=mean, stddev=stddev) - data = tensor.to_list() + data = tensor.tolist() assert len(data) == 1 * 2 * 3 * 4 * 5 * 6 #assert all([abs(x - mean) <= 3 * stddev for x in data]) TODO @@ -46,7 +46,7 @@ def test_tensor_fill_const_1d(): assert tensor.shape == (4,) assert tensor.numel == 4 assert tensor.rank == 1 - data = tensor.to_list() + data = tensor.tolist() assert data == [1, 2, 3, 4] def test_tensor_fill_const_2d(): @@ -58,7 +58,7 @@ def test_tensor_fill_const_2d(): assert tensor.shape == (2, 2) assert tensor.numel == 4 assert tensor.rank == 2 - data = tensor.to_list() + data = tensor.tolist() assert data == [1, 2, 3, 4] def test_tensor_fill_const_3d(): @@ -76,7 +76,7 @@ def test_tensor_fill_const_3d(): assert tensor.shape == (2, 2, 2) assert tensor.numel == 8 assert tensor.rank == 3 - data = tensor.to_list() + data = tensor.tolist() assert data == [1, 2, 3, 4, 1, 2, 3, 4] def test_tensor_fill_const_4d(): @@ -106,7 +106,7 @@ def test_tensor_fill_const_4d(): assert tensor.shape == (2, 2, 2, 2) assert tensor.numel == 16 assert tensor.rank == 4 - data = tensor.to_list() + data = tensor.tolist() assert data == [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4] def test_tensor_fill_const_5d(): @@ -160,7 +160,7 @@ def test_tensor_fill_const_5d(): assert tensor.shape == (2, 2, 2, 2, 2) assert tensor.numel == 32 assert tensor.rank == 5 - data = tensor.to_list() + data = tensor.tolist() assert data == [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4] def test_tensor_fill_const_6d(): @@ -262,6 +262,6 @@ def test_tensor_fill_const_6d(): assert tensor.shape == (2, 2, 2, 2, 2, 2) assert tensor.numel == 64 assert tensor.rank == 6 - data = tensor.to_list() + data = tensor.tolist() assert data == [1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 
1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4] diff --git a/python/tests/tensor_ops1.py b/python/tests/tensor_ops1.py index 87c8440..e49b10a 100644 --- a/python/tests/tensor_ops1.py +++ b/python/tests/tensor_ops1.py @@ -8,7 +8,7 @@ def test_tensor_clone(): assert a.shape == b.shape assert a.numel == b.numel assert a.rank == b.rank - assert a.to_list() == b.to_list() + assert a.tolist() == b.tolist() assert a.is_contiguous == b.is_contiguous def test_tensor_transpose(): @@ -20,15 +20,15 @@ def test_tensor_transpose(): assert b.numel == 6 assert a.rank == 2 assert b.rank == 2 - assert a.to_list() == [1, 1, 1, 1, 1, 1] - assert b.to_list() == [1, 1, 1, 1, 1, 1] + assert a.tolist() == [1, 1, 1, 1, 1, 1] + assert b.tolist() == [1, 1, 1, 1, 1, 1] assert a.is_contiguous assert not a.is_transposed assert not a.is_permuted assert not b.is_contiguous assert b.is_transposed assert b.is_permuted - +""" def test_tensor_transpose_6d(): a = Tensor.full((1, 2, 3, 4, 5, 6), fill_value=1) b = a.transpose() @@ -38,15 +38,15 @@ def test_tensor_transpose_6d(): assert b.numel == 720 assert a.rank == 6 assert b.rank == 6 - assert a.to_list() == [1] * 720 - assert b.to_list() == [1] * 720 + assert a.tolist() == [1] * 720 + assert b.tolist() == [1] * 720 assert a.is_contiguous assert not a.is_transposed assert not a.is_permuted assert not b.is_contiguous assert b.is_transposed assert b.is_permuted - +""" def test_tensor_permute(): a = Tensor.full((2, 3), fill_value=1) b = a.permute((1, 0)) @@ -56,8 +56,8 @@ def test_tensor_permute(): assert b.numel == 6 assert a.rank == 2 assert b.rank == 2 - assert a.to_list() == [1, 1, 1, 1, 1, 1] - assert b.to_list() == [1, 1, 1, 1, 1, 1] + assert a.tolist() == [1, 1, 1, 1, 1, 1] + assert b.tolist() == [1, 1, 1, 1, 1, 1] assert a.is_contiguous assert not a.is_transposed assert not a.is_permuted @@ -74,8 +74,8 @@ def test_tensor_permute_6d(): assert b.numel 
== 720 assert a.rank == 6 assert b.rank == 6 - assert a.to_list() == [1] * 720 - assert b.to_list() == [1] * 720 + assert a.tolist() == [1] * 720 + assert b.tolist() == [1] * 720 assert a.is_contiguous assert not a.is_transposed assert not a.is_permuted diff --git a/python/tests/tests.py b/python/tests/tests.py index 5df02eb..a39f602 100644 --- a/python/tests/tests.py +++ b/python/tests/tests.py @@ -8,4 +8,4 @@ def test_import_magnetron(): def test_simple_exec(): import magnetron as mag a = mag.Tensor.const([1, 4, 1]) - assert a.max().scalar() == 4 + assert a.max()[0] == 4 From d1bc1a9775b6d38417eef5d73d02a8412876019e Mon Sep 17 00:00:00 2001 From: "Mario Sieg (Neo)" Date: Fri, 24 Jan 2025 02:26:44 +0100 Subject: [PATCH 13/16] Add linux arm64 runner --- .github/workflows/cmake-multi-platform.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index 471b207..c4b7c79 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -13,7 +13,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ubuntu-latest, windows-latest, macos-latest, ubuntu-24.04-arm] build_type: [Release] c_compiler: [gcc, clang, cl] include: From f6be80a0e03fd1512b05697832a32c8f985e474c Mon Sep 17 00:00:00 2001 From: "Mario Sieg (Neo)" Date: Fri, 24 Jan 2025 02:28:50 +0100 Subject: [PATCH 14/16] Fix arm64 feature matrix --- .github/workflows/cmake-multi-platform.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index c4b7c79..f592834 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -26,6 +26,12 @@ jobs: - os: ubuntu-latest c_compiler: clang cpp_compiler: clang++ + - os: ubuntu-24.04-arm + c_compiler: gcc + cpp_compiler: g++ + - os: 
ubuntu-24.04-arm + c_compiler: clang + cpp_compiler: clang++ - os: macos-latest c_compiler: clang cpp_compiler: clang++ @@ -36,6 +42,8 @@ jobs: c_compiler: clang - os: ubuntu-latest c_compiler: cl + - os: ubuntu-24.04-arm + c_compiler: cl - os: macos-latest c_compiler: gcc - os: macos-latest From 62843026af95884c8e3f54cee0d9f270684dd7d2 Mon Sep 17 00:00:00 2001 From: "Mario Sieg (Neo)" Date: Fri, 24 Jan 2025 02:37:40 +0100 Subject: [PATCH 15/16] Fixed arm64 loop search index --- magnetron/magnetron_cpu.c | 2 +- magnetron/magnetron_cpu_blas.inl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/magnetron/magnetron_cpu.c b/magnetron/magnetron_cpu.c index d2d3c64..089cf21 100644 --- a/magnetron/magnetron_cpu.c +++ b/magnetron/magnetron_cpu.c @@ -88,7 +88,7 @@ static const mag_arm64_blas_specialization mag_arm64_blas_specializations[] = { static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_registry_t* kernels) { uint64_t cap_avail = ctx->machine.arm64_cpu_caps; - for (size_t i=1; i < sizeof(mag_arm64_blas_specializations)/sizeof(*mag_arm64_blas_specializations); ++i) { /* Find best blas spec for the host CPU */ + for (size_t i=0; i < sizeof(mag_arm64_blas_specializations)/sizeof(*mag_arm64_blas_specializations); ++i) { /* Find best blas spec for the host CPU */ const mag_arm64_blas_specialization* spec = mag_arm64_blas_specializations+i; uint64_t cap_required = (*spec->get_cap_permutation)(); /* Get requires features */ if ((cap_avail & cap_required) == cap_required) { /* Since specializations are sorted by score, we found the perfect spec. 
*/ diff --git a/magnetron/magnetron_cpu_blas.inl b/magnetron/magnetron_cpu_blas.inl index 332ed9d..182bd20 100644 --- a/magnetron/magnetron_cpu_blas.inl +++ b/magnetron/magnetron_cpu_blas.inl @@ -1559,8 +1559,8 @@ uint64_t MAG_BLAS_SPECIALIZATION_FEAT_REQUEST() { #elif defined(__aarch64__) -mag_arm64_cap_t MAG_BLAS_SPECIALIZATION_FEAT_REQUEST(void) { - mag_arm64_cap_t caps = 1u< Date: Fri, 24 Jan 2025 03:04:05 +0100 Subject: [PATCH 16/16] Win32 fixes --- cmake/blas_tune.cmake | 3 ++- magnetron/magnetron.c | 1 - magnetron/magnetron_cpu.c | 26 +++++++++++------------ magnetron/magnetron_cpu_blas_amd64_v2.c | 4 ++++ magnetron/magnetron_cpu_blas_amd64_v2_5.c | 3 +++ magnetron/magnetron_cpu_blas_amd64_v3.c | 2 ++ magnetron/magnetron_cpu_blas_amd64_v4.c | 5 +++-- magnetron/magnetron_cpu_blas_amd64_v4_5.c | 2 ++ 8 files changed, 29 insertions(+), 17 deletions(-) diff --git a/cmake/blas_tune.cmake b/cmake/blas_tune.cmake index 216511a..4eea9cb 100644 --- a/cmake/blas_tune.cmake +++ b/cmake/blas_tune.cmake @@ -26,7 +26,8 @@ if (${IS_AMD64}) # x86-64 specific compilation options set(MAGNETRON_SOURCES ${MAGNETRON_SOURCES} ${MAGNETRON_BLAS_SPEC_AMD64_SOURCES}) set_blas_spec_arch("magnetron_cpu_blas_amd64_v2.c" "-mtune=nehalem -mcx16 -mpopcnt -msse3 -mssse3 -msse4.1 -msse4.2" "/arch:SSE4.2") set_blas_spec_arch("magnetron_cpu_blas_amd64_v2_5.c" "-mtune=ivybridge -mavx -mno-avx2 -mcx16 -mpopcnt -msse3 -mssse3 -msse4.1 -msse4.2" "/arch:AVX") - set_blas_spec_arch("magnetron_cpu_blas_amd64_v3.c" "-mtune=haswell -mavx -mavx2 -mbmi -mbmi2 -mf16c -mfma -mlzcnt -mmovbe" "/arch:AVX2") + set_blas_spec_arch("magnetron_cpu_blas_amd64_v3.c" "-mtune=haswell -mavx -mavx2 -mbmi -mbmi2 -mf16c -mfma -mlzcnt -mmovbe" + "/arch:AVX2 /D__BMI__=1 /D__BMI2__=1 /D__F16C__=1 /D__FMA__=1") # MSVC is just annoying set_blas_spec_arch("magnetron_cpu_blas_amd64_v4.c" "-mtune=cannonlake -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx -mavx2 -mbmi -mbmi2 -mf16c -mfma -mlzcnt -mmovbe" "/arch:AVX512") 
set_blas_spec_arch("magnetron_cpu_blas_amd64_v4_5.c" "-mtune=generic -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mavx -mavx2 -mbmi -mbmi2 -mf16c -mfma -mlzcnt -mmovbe" "/arch:AVX512") elseif(${IS_ARM64}) diff --git a/magnetron/magnetron.c b/magnetron/magnetron.c index 80561aa..188c249 100644 --- a/magnetron/magnetron.c +++ b/magnetron/magnetron.c @@ -590,7 +590,6 @@ void mag_ctx_destroy(mag_ctx_t* ctx) { } *head = NULL; #endif - mag_fixed_intrusive_pool_print_info(&ctx->tensor_pool, "Tensor Hull Pool"); mag_fixed_intrusive_pool_destroy(&ctx->tensor_pool); mag_destroy_dynamic_device(ctx->device); ctx->device = NULL; memset(ctx, 0, sizeof(*ctx)); diff --git a/magnetron/magnetron_cpu.c b/magnetron/magnetron_cpu.c index 089cf21..ac16892 100644 --- a/magnetron/magnetron_cpu.c +++ b/magnetron/magnetron_cpu.c @@ -31,15 +31,15 @@ mag_amd64_blas_spec_decl(3); mag_amd64_blas_spec_decl(2_5); mag_amd64_blas_spec_decl(2); -static const mag_amd64_blas_specialization mag_amd64_blas_specializations[] = { /* Dynamic selectable BLAS permutations, sorted from best to worst score. */ - mag_amd64_blas_spec_permute(4_5), - mag_amd64_blas_spec_permute(4), - mag_amd64_blas_spec_permute(3), - mag_amd64_blas_spec_permute(2_5), - mag_amd64_blas_spec_permute(2), -}; - static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_registry_t* kernels) { + const mag_amd64_blas_specialization mag_amd64_blas_specializations[] = { /* Dynamic selectable BLAS permutations, sorted from best to worst score. 
*/ + mag_amd64_blas_spec_permute(4_5), + mag_amd64_blas_spec_permute(4), + mag_amd64_blas_spec_permute(3), + mag_amd64_blas_spec_permute(2_5), + mag_amd64_blas_spec_permute(2), + }; + uint64_t cap_avail = ctx->machine.amd64_cpu_caps; for (size_t i=0; i < sizeof(mag_amd64_blas_specializations)/sizeof(*mag_amd64_blas_specializations); ++i) { /* Find best blas spec for the host CPU */ const mag_amd64_blas_specialization* spec = mag_amd64_blas_specializations+i; @@ -81,12 +81,12 @@ typedef struct mag_arm64_blas_specialization { mag_arm64_blas_spec_decl(9); mag_arm64_blas_spec_decl(8_2); -static const mag_arm64_blas_specialization mag_arm64_blas_specializations[] = { /* Dynamic selectable BLAS permutations, sorted from best to worst score. */ - mag_arm64_blas_spec_permute(9), - mag_arm64_blas_spec_permute(8_2), -}; - static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_registry_t* kernels) { + const mag_arm64_blas_specialization mag_arm64_blas_specializations[] = { /* Dynamic selectable BLAS permutations, sorted from best to worst score. */ + mag_arm64_blas_spec_permute(9), + mag_arm64_blas_spec_permute(8_2), + }; + uint64_t cap_avail = ctx->machine.arm64_cpu_caps; for (size_t i=0; i < sizeof(mag_arm64_blas_specializations)/sizeof(*mag_arm64_blas_specializations); ++i) { /* Find best blas spec for the host CPU */ const mag_arm64_blas_specialization* spec = mag_arm64_blas_specializations+i; diff --git a/magnetron/magnetron_cpu_blas_amd64_v2.c b/magnetron/magnetron_cpu_blas_amd64_v2.c index 29b2119..39a5bfa 100644 --- a/magnetron/magnetron_cpu_blas_amd64_v2.c +++ b/magnetron/magnetron_cpu_blas_amd64_v2.c @@ -1,5 +1,7 @@ /* (c) 2025 Mario "Neo" Sieg. 
*/ + +#ifndef _MSC_VER #if !defined(__SSE__) \ || !defined(__SSE2__) \ || !defined(__SSE3__) \ @@ -8,6 +10,7 @@ || !defined(__SSE4_2__) #error "BLAS specialization requires matching compile flags" #endif +#endif #ifdef __AVX__ #error "BLAS specialization feature too high" #endif @@ -16,3 +19,4 @@ #define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_v2_features #include "magnetron_cpu_blas.inl" + diff --git a/magnetron/magnetron_cpu_blas_amd64_v2_5.c b/magnetron/magnetron_cpu_blas_amd64_v2_5.c index 625a5cd..a025549 100644 --- a/magnetron/magnetron_cpu_blas_amd64_v2_5.c +++ b/magnetron/magnetron_cpu_blas_amd64_v2_5.c @@ -1,5 +1,7 @@ /* (c) 2025 Mario "Neo" Sieg. */ + +#ifndef _MSC_VER #if !defined(__SSE__) \ || !defined(__SSE2__) \ || !defined(__SSE3__) \ @@ -9,6 +11,7 @@ || !defined(__AVX__) #error "BLAS specialization requires matching compile flags" #endif +#endif #ifdef __AVX2__ #error "BLAS specialization feature too high" #endif diff --git a/magnetron/magnetron_cpu_blas_amd64_v3.c b/magnetron/magnetron_cpu_blas_amd64_v3.c index d0064fb..474299c 100644 --- a/magnetron/magnetron_cpu_blas_amd64_v3.c +++ b/magnetron/magnetron_cpu_blas_amd64_v3.c @@ -1,5 +1,6 @@ /* (c) 2025 Mario "Neo" Sieg. */ +#ifndef _MSC_VER #if !defined(__SSE__) \ || !defined(__SSE2__) \ || !defined(__SSE3__) \ @@ -14,6 +15,7 @@ || !defined(__FMA__) #error "BLAS specialization requires matching compile flags" #endif +#endif #ifdef __AVX512F__ #error "BLAS specialization feature too high" #endif diff --git a/magnetron/magnetron_cpu_blas_amd64_v4.c b/magnetron/magnetron_cpu_blas_amd64_v4.c index 2c706f6..2ed24fc 100644 --- a/magnetron/magnetron_cpu_blas_amd64_v4.c +++ b/magnetron/magnetron_cpu_blas_amd64_v4.c @@ -1,5 +1,7 @@ /* (c) 2025 Mario "Neo" Sieg. 
*/ + +#ifndef _MSC_VER #if !defined(__SSE__) \ || !defined(__SSE2__) \ || !defined(__SSE3__) \ @@ -8,8 +10,6 @@ || !defined(__SSE4_2__) \ || !defined(__AVX__) \ || !defined(__AVX2__) \ - || !defined(__BMI__) \ - || !defined(__BMI2__) \ || !defined(__F16C__) \ || !defined(__FMA__) \ || !defined(__AVX512F__) \ @@ -18,6 +18,7 @@ || !defined(__AVX512VL__) #error "BLAS specialization requires matching compile flags" #endif +#endif #ifdef __AVX512VNNI__ #error "BLAS specialization feature too high" #endif diff --git a/magnetron/magnetron_cpu_blas_amd64_v4_5.c b/magnetron/magnetron_cpu_blas_amd64_v4_5.c index ddca599..de8e725 100644 --- a/magnetron/magnetron_cpu_blas_amd64_v4_5.c +++ b/magnetron/magnetron_cpu_blas_amd64_v4_5.c @@ -1,5 +1,6 @@ /* (c) 2025 Mario "Neo" Sieg. */ +#ifndef _MSC_VER #if !defined(__SSE__) \ || !defined(__SSE2__) \ || !defined(__SSE3__) \ @@ -20,6 +21,7 @@ || !defined(__AVX512BF16__) #error "BLAS specialization requires matching compile flags" #endif +#endif #ifdef __APX__ #error "BLAS specialization feature too high" #endif