diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index 71af77e..f592834 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -13,7 +13,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ubuntu-latest, windows-latest, macos-latest, ubuntu-24.04-arm] build_type: [Release] c_compiler: [gcc, clang, cl] include: @@ -26,6 +26,12 @@ jobs: - os: ubuntu-latest c_compiler: clang cpp_compiler: clang++ + - os: ubuntu-24.04-arm + c_compiler: gcc + cpp_compiler: g++ + - os: ubuntu-24.04-arm + c_compiler: clang + cpp_compiler: clang++ - os: macos-latest c_compiler: clang cpp_compiler: clang++ @@ -36,6 +42,8 @@ jobs: c_compiler: clang - os: ubuntu-latest c_compiler: cl + - os: ubuntu-24.04-arm + c_compiler: cl - os: macos-latest c_compiler: gcc - os: macos-latest @@ -103,4 +111,4 @@ jobs: shell: bash run: | . .venv/bin/activate - python -m pytest ${{ github.workspace }}/python/tests/tests.py + python -m pytest ${{ github.workspace }}/python/tests/* diff --git a/.gitignore b/.gitignore index dd3c6b5..f42a950 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ out/ .tmp CMakeSettings.json magnetron_chat/.idea +/build/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 30be04a..2af510a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,7 +35,7 @@ endif() include(cmake/arch.cmake) include(cmake/lib.cmake) -include(cmake/comflags.cmake) +include(cmake/compiler_config.cmake) if (${MAGNETRON_ENABLE_CUDA}) include(cmake/cuda.cmake) diff --git a/cmake/blas_tune.cmake b/cmake/blas_tune.cmake index ca90d90..4eea9cb 100644 --- a/cmake/blas_tune.cmake +++ b/cmake/blas_tune.cmake @@ -10,25 +10,28 @@ function(set_blas_spec_arch filename posix_arch msvc_arch) endfunction() set(MAGNETRON_BLAS_SPEC_AMD64_SOURCES - magnetron/magnetron_cpu_blas_amd64_sse42.c - magnetron/magnetron_cpu_blas_amd64_avx.c - magnetron/magnetron_cpu_blas_amd64_avx2.c - magnetron/magnetron_cpu_blas_amd64_avx512f.c - magnetron/magnetron_cpu_blas_amd64_znver4.c + magnetron/magnetron_cpu_blas_amd64_v2.c + magnetron/magnetron_cpu_blas_amd64_v2_5.c + magnetron/magnetron_cpu_blas_amd64_v3.c + magnetron/magnetron_cpu_blas_amd64_v4.c + magnetron/magnetron_cpu_blas_amd64_v4_5.c ) set(MAGNETRON_BLAS_SPEC_ARM64_SOURCES - magnetron/magnetron_cpu_blas_arm64_82.c + magnetron/magnetron_cpu_blas_arm64_v8_2.c + magnetron/magnetron_cpu_blas_arm64_v9.c ) if (${IS_AMD64}) # x86-64 specific compilation options set(MAGNETRON_SOURCES ${MAGNETRON_SOURCES} ${MAGNETRON_BLAS_SPEC_AMD64_SOURCES}) - set_blas_spec_arch("magnetron_cpu_blas_amd64_sse42.c" "-mtune=nehalem -msse4.2" "/arch:SSE4.2") - set_blas_spec_arch("magnetron_cpu_blas_amd64_avx.c" "-mtune=sandybridge -mavx" "/arch:AVX") - set_blas_spec_arch("magnetron_cpu_blas_amd64_avx2.c" "-mtune=skylake -mavx -mavx2 -mfma -mf16c" "/arch:AVX2") - set_blas_spec_arch("magnetron_cpu_blas_amd64_avx512f.c" "-mtune=cannonlake -mavx -mavx2 -mfma -mf16c -mavx512f" "/arch:AVX512") - set_blas_spec_arch("magnetron_cpu_blas_amd64_znver4.c" "-mavx -mavx2 -mfma -mf16c -mavx512f -mavx512vl -mavx512vnni -mavx512bf16 -mavx512bw -mavx512dq" "/arch:AVX512") + set_blas_spec_arch("magnetron_cpu_blas_amd64_v2.c" "-mtune=nehalem -mcx16 -mpopcnt -msse3 -mssse3 -msse4.1 -msse4.2" "/arch:SSE4.2") + set_blas_spec_arch("magnetron_cpu_blas_amd64_v2_5.c" "-mtune=ivybridge -mavx -mno-avx2 -mcx16 -mpopcnt -msse3 -mssse3 -msse4.1 -msse4.2" "/arch:AVX") + 
set_blas_spec_arch("magnetron_cpu_blas_amd64_v3.c" "-mtune=haswell -mavx -mavx2 -mbmi -mbmi2 -mf16c -mfma -mlzcnt -mmovbe" + "/arch:AVX2 /D__BMI__=1 /D__BMI2__=1 /D__F16C__=1 /D__FMA__=1") # MSVC is just annoying + set_blas_spec_arch("magnetron_cpu_blas_amd64_v4.c" "-mtune=cannonlake -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx -mavx2 -mbmi -mbmi2 -mf16c -mfma -mlzcnt -mmovbe" "/arch:AVX512") + set_blas_spec_arch("magnetron_cpu_blas_amd64_v4_5.c" "-mtune=generic -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mavx -mavx2 -mbmi -mbmi2 -mf16c -mfma -mlzcnt -mmovbe" "/arch:AVX512") elseif(${IS_ARM64}) set(MAGNETRON_SOURCES ${MAGNETRON_SOURCES} ${MAGNETRON_BLAS_SPEC_ARM64_SOURCES}) - set_blas_spec_arch("magnetron_cpu_blas_arm64_82.c" "-march=armv8.2-a+dotprod+fp16" "") + set_blas_spec_arch("magnetron_cpu_blas_arm64_v8_2.c" "-march=armv8.2-a+dotprod+fp16" "") + set_blas_spec_arch("magnetron_cpu_blas_arm64_v9.c" "-march=armv9-a+sve+sve2" "") endif() \ No newline at end of file diff --git a/cmake/comflags.cmake b/cmake/compiler_config.cmake similarity index 88% rename from cmake/comflags.cmake rename to cmake/compiler_config.cmake index 223bf2b..a4e443d 100644 --- a/cmake/comflags.cmake +++ b/cmake/compiler_config.cmake @@ -27,6 +27,7 @@ set(MAG_CLANG_COMPILE_FLAGS set(MAG_CLANG_RELEASE_COMPILE_FLAGS -O3 -flto + -fomit-frame-pointer ) set(MAG_CLANG_LINK_OPTIONS "") set(MAG_CLANG_RELEASE_LINK_OPTIONS -flto) @@ -50,6 +51,14 @@ set(MAG_GCC_RELEASE_COMPILE_FLAGS set(MAG_GCC_LINK_OPTIONS "") set(MAG_GCC_RELEASE_LINK_OPTIONS -flto) +if (${IS_ARM64}) + set(MAG_CLANG_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -march=armv8-a) + set(MAG_GCC_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -march=armv8-a) +elseif (${IS_AMD64}) + set(MAG_CLANG_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -msse -msse2) + set(MAG_GCC_COMPILE_FLAGS ${MAG_CLANG_COMPILE_FLAGS} -msse -msse2) +endif() + if (WIN32) # Windows (MSVC) specific config target_compile_options(magnetron PRIVATE ${MAG_MSVC_COMPILE_FLAGS}) target_link_options(magnetron PRIVATE ${MAG_MSVC_LINK_OPTIONS}) diff --git a/magnetron/magnetron.c b/magnetron/magnetron.c index 3d4ab9e..188c249 100644 --- a/magnetron/magnetron.c +++ b/magnetron/magnetron.c @@ -57,8 +57,7 @@ #include #include #ifdef __aarch64__ -#include -#include #endif #endif #endif @@ -139,30 +138,6 @@ void mag_free_aligned(void* blk) { #define STB_IMAGE_WRITE_IMPLEMENTATION #include -#if defined(__x86_64__) || defined(_M_X64) -#define _(enumerator, leaf, reg, bit) #enumerator -const char* const mag_x86_64_feature_names[MAG_X86_64_FEATURE__NUM] = { - mag_x86_64_feature_def(_, MAG_SEP) -}; -#undef _ -#define _(enumerator, leaf, reg, bit) (0xff&MAG_X86_64_CPUID_##leaf) -const uint8_t mag_x86_64_feature_leaves[MAG_X86_64_FEATURE__NUM] = { - mag_x86_64_feature_def(_, MAG_SEP) -}; -#undef _ -#define _(enumerator, leaf, reg, bit) (0xff&MAG_X86_64_CPUID_##reg) -const uint8_t mag_x86_64_feature_regs[MAG_X86_64_FEATURE__NUM] = { - mag_x86_64_feature_def(_, MAG_SEP) -}; -#undef _ -#define _(enumerator, leaf, reg, bit) (1u<<(bit)) -const uint32_t mag_x86_64_feature_masks[MAG_X86_64_FEATURE__NUM] = { - mag_x86_64_feature_def(_, MAG_SEP) -}; -#undef _ -#undef mag_x86_64_feature_def -#endif - void* (*mag_get_alloc_fn(void))(void* blk, size_t size) { return mag_alloc; } @@ -308,6 +283,19 @@ uintptr_t mag_thread_id(void) { return tid; } +#if defined(__x86_64__) || defined(_M_X64) +#define _(enumerator, leaf, reg, bit) #enumerator +const char* const mag_amd64_cap_names[MAG_AMD64_CAP__NUM] = { + 
mag_x86_64_feature_def(_, MAG_SEP) +}; +#undef _ +#elif defined(__aarch64__) +#define _(ident) #ident +const char* const mag_arm64_cap_names[MAG_ARM64_CAP__NUM] = { + mag_arm64_feature_def(_, MAG_SEP) +}; +#endif + static uint64_t mag_hpc_clock_ns(void) { /* High precision clock in nanoseconds. */ #ifdef _WIN32 static LONGLONG t_freq; @@ -475,7 +463,7 @@ static void mag_prng_init(mag_ctx_t* ctx, uint64_t seed) { static void mag_system_host_info_query(mag_ctx_t* ctx); /* Query host system information. */ static void mag_system_host_info_dump(mag_ctx_t* ctx) { - mag_log_info("OS/Kernel: %s", ctx->sys.os_name); + mag_log_info("OS/Kernel: %s", ctx->machine.os_name); const char* cpu_arch = "?"; #if defined(__x86_64__) || defined(_M_X64) cpu_arch = "x86-64"; @@ -484,25 +472,30 @@ static void mag_system_host_info_dump(mag_ctx_t* ctx) { #else #error "Unknwon CPU arch" #endif - mag_log_info("CPU (%s): %s, Virtual Cores: %u, Physical Cores: %u, Sockets: %u", cpu_arch, ctx->sys.cpu_name, ctx->sys.cpu_virtual_cores, ctx->sys.cpu_physical_cores, ctx->sys.cpu_sockets); + mag_log_info("CPU: %s, Virtual Cores: %u, Physical Cores: %u, Sockets: %u", ctx->machine.cpu_name, ctx->machine.cpu_virtual_cores, ctx->machine.cpu_physical_cores, ctx->machine.cpu_sockets); #if defined(__x86_64__) || defined(_M_X64) /* Print CPU features for x86-64 platforms. */ if (mag_log_enabled) { - printf("CPU Features:"); - for (uint32_t i=0, k=0; i < MAG_X86_64_FEATURE__NUM; ++i) { - if (mag_ctx_x86_64_cpu_has_feature(ctx, i)) { - if ((k++ & 7) == 0) printf("\n\t"); - printf("%s ", mag_x86_64_feature_names[i]); - } - } + printf(MAG_CC_CYAN "[magnetron] " MAG_CC_RESET "%s caps: ", cpu_arch); + for (uint64_t i=0; i < MAG_AMD64_CAP__NUM; ++i) + if (ctx->machine.amd64_cpu_caps & (1ull<<i)) printf("%s ", mag_amd64_cap_names[i]); + putchar('\n'); } + #elif defined(__aarch64__) + if (mag_log_enabled) { + printf(MAG_CC_CYAN "[magnetron] " MAG_CC_RESET "%s caps: ", cpu_arch); + for (uint64_t i=0; i < MAG_ARM64_CAP__NUM; ++i) + if (ctx->machine.arm64_cpu_caps & (1ull<<i)) printf("%s ", mag_arm64_cap_names[i]); + putchar('\n'); + } #endif double mem_total, mem_free, mem_used; const char* mem_unit_total, *mem_unit_free, *mem_unit_used; - mag_humanize_memory_size(ctx->sys.phys_mem_total, &mem_total, &mem_unit_total); - mag_humanize_memory_size(ctx->sys.phys_mem_free, &mem_free, &mem_unit_free); - mag_humanize_memory_size((size_t)llabs((int64_t)ctx->sys.phys_mem_total-(int64_t)ctx->sys.phys_mem_free), &mem_used, &mem_unit_used); - double mem_used_percent = fabs((double)(ctx->sys.phys_mem_total-ctx->sys.phys_mem_free))/(double)ctx->sys.phys_mem_total*100.0; + mag_humanize_memory_size(ctx->machine.phys_mem_total, &mem_total, &mem_unit_total); + mag_humanize_memory_size(ctx->machine.phys_mem_free, &mem_free, &mem_unit_free); + mag_humanize_memory_size((size_t)llabs((int64_t)ctx->machine.phys_mem_total-(int64_t)ctx->machine.phys_mem_free), &mem_used, &mem_unit_used); + double mem_used_percent = fabs((double)(ctx->machine.phys_mem_total-ctx->machine.phys_mem_free))/(double)ctx->machine.phys_mem_total*100.0; mag_log_info("Physical Machine Memory: %.03f %s, Free: %.03f %s, Used: %.03f %s (%.02f%%)", mem_total, mem_unit_total, mem_free, mem_unit_free, mem_used, mem_unit_used, mem_used_percent); } @@ -597,7 +590,6 @@ void mag_ctx_destroy(mag_ctx_t* ctx) { } *head = NULL; #endif - mag_fixed_intrusive_pool_print_info(&ctx->tensor_pool, "Tensor Hull Pool"); mag_fixed_intrusive_pool_destroy(&ctx->tensor_pool); mag_destroy_dynamic_device(ctx->device); ctx->device = NULL; memset(ctx, 0, sizeof(*ctx)); @@ -622,13 +614,13 @@ void mag_ctx_set_prng_algorithm(mag_ctx_t* ctx, mag_prng_algorithm_t algorithm, mag_compute_device_type_t mag_ctx_get_compute_device_type(const mag_ctx_t* ctx) { return ctx->device_type; } const char* mag_ctx_get_compute_device_name(const mag_ctx_t* ctx) { return ctx->device->name; } -const char* mag_ctx_get_os_name(const mag_ctx_t* ctx) { return ctx->sys.os_name; } -const char*
mag_ctx_get_cpu_name(const mag_ctx_t* ctx) { return ctx->sys.cpu_name; } -uint32_t mag_ctx_get_cpu_virtual_cores(const mag_ctx_t* ctx) { return ctx->sys.cpu_virtual_cores; } -uint32_t mag_ctx_get_cpu_physical_cores(const mag_ctx_t* ctx) { return ctx->sys.cpu_physical_cores; } -uint32_t mag_ctx_get_cpu_sockets(const mag_ctx_t* ctx) { return ctx->sys.cpu_sockets; } -uint64_t mag_ctx_get_physical_memory_total(const mag_ctx_t* ctx) { return ctx->sys.phys_mem_total; } -uint64_t mag_ctx_get_physical_memory_free(const mag_ctx_t* ctx) { return ctx->sys.phys_mem_free; } +const char* mag_ctx_get_os_name(const mag_ctx_t* ctx) { return ctx->machine.os_name; } +const char* mag_ctx_get_cpu_name(const mag_ctx_t* ctx) { return ctx->machine.cpu_name; } +uint32_t mag_ctx_get_cpu_virtual_cores(const mag_ctx_t* ctx) { return ctx->machine.cpu_virtual_cores; } +uint32_t mag_ctx_get_cpu_physical_cores(const mag_ctx_t* ctx) { return ctx->machine.cpu_physical_cores; } +uint32_t mag_ctx_get_cpu_sockets(const mag_ctx_t* ctx) { return ctx->machine.cpu_sockets; } +uint64_t mag_ctx_get_physical_memory_total(const mag_ctx_t* ctx) { return ctx->machine.phys_mem_total; } +uint64_t mag_ctx_get_physical_memory_free(const mag_ctx_t* ctx) { return ctx->machine.phys_mem_free; } bool mag_ctx_is_numa_system(const mag_ctx_t* ctx) { return false; /* TODO */ } size_t mag_ctx_get_total_tensors_created(const mag_ctx_t* ctx) { return 0; /* TODO */ } @@ -796,24 +788,14 @@ void mag_ctx_profile_stop_recording(mag_ctx_t* ctx, const char* export_csv_file) bool csv = export_csv_file && *export_csv_file; if (!csv) { mag_print_separator(stdout); - printf("OS/Kernel: %s\n", ctx->sys.os_name); - printf("CPU: %s, Virtual Cores: %u, Physical Cores: %u, Sockets: %u\n", ctx->sys.cpu_name, ctx->sys.cpu_virtual_cores, ctx->sys.cpu_physical_cores, ctx->sys.cpu_sockets); - #if defined(__x86_64__) || defined(_M_X64) /* Print CPU features for x86-64 platforms. 
*/ - printf("CPU Features:"); - for (unsigned i=0, k=0; i < MAG_X86_64_FEATURE__NUM; ++i) { - if (mag_ctx_x86_64_cpu_has_feature(ctx, i)) { - if (k++ % 8 == 0) printf("\n\t"); - printf("%s ", mag_x86_64_feature_names[i]); - } - } - putchar('\n'); - #endif + printf("OS/Kernel: %s\n", ctx->machine.os_name); + printf("CPU: %s, Virtual Cores: %u, Physical Cores: %u, Sockets: %u\n", ctx->machine.cpu_name, ctx->machine.cpu_virtual_cores, ctx->machine.cpu_physical_cores, ctx->machine.cpu_sockets); double mem_total, mem_free, mem_used; const char* mem_unit_total, *mem_unit_free, *mem_unit_used; - mag_humanize_memory_size(ctx->sys.phys_mem_total, &mem_total, &mem_unit_total); - mag_humanize_memory_size(ctx->sys.phys_mem_free, &mem_free, &mem_unit_free); - mag_humanize_memory_size((size_t)llabs((int64_t)ctx->sys.phys_mem_total-(int64_t)ctx->sys.phys_mem_free), &mem_used, &mem_unit_used); - double mem_used_percent = fabs((double)(ctx->sys.phys_mem_total-ctx->sys.phys_mem_free))/(double)ctx->sys.phys_mem_total*100.0; + mag_humanize_memory_size(ctx->machine.phys_mem_total, &mem_total, &mem_unit_total); + mag_humanize_memory_size(ctx->machine.phys_mem_free, &mem_free, &mem_unit_free); + mag_humanize_memory_size((size_t)llabs((int64_t)ctx->machine.phys_mem_total-(int64_t)ctx->machine.phys_mem_free), &mem_used, &mem_unit_used); + double mem_used_percent = fabs((double)(ctx->machine.phys_mem_total-ctx->machine.phys_mem_free))/(double)ctx->machine.phys_mem_total*100.0; printf("Physical memory: %.03f %s, Free: %.03f %s, Used: %.03f %s (%.02f%%)\n", mem_total, mem_unit_total, mem_free, mem_unit_free, mem_used, mem_unit_used, mem_used_percent); mag_print_separator(stdout); printf("%16s %16s %16s %16s %16s\n", "Operation", "Executions", "Usage (%)", "AVG Time (μs)", "Total Time (μs)"); @@ -2670,90 +2652,173 @@ static void MAG_COLDPROC mag_system_host_info_query_memory(uint64_t* out_phys_me return (uint64_t)lo | ((uint64_t)hi << 32); #endif } - #define mag_cpy_regs(id) \ - (*features)[MAG_X86_64_CPUID_##id][MAG_X86_64_CPUID_EAX] = eax; \ - (*features)[MAG_X86_64_CPUID_##id][MAG_X86_64_CPUID_EBX] = ebx; \ - (*features)[MAG_X86_64_CPUID_##id][MAG_X86_64_CPUID_ECX] = ecx; \ - (*features)[MAG_X86_64_CPUID_##id][MAG_X86_64_CPUID_EDX] = edx - static void MAG_COLDPROC mag_system_info_query_x86_64_cpu_features(uint32_t (*features)[8][4]) { + static void MAG_COLDPROC mag_system_info_query_amd64_cpu_caps(uint64_t* caps) { + *caps = 0; + uint32_t regs[8][4] = {0}; + + #define H0 0 + #define H1 1 + #define H2 2 + #define H7 3 + #define H80000001 4 + #define H80000007 5 + #define H16 6 + #define H7_1H 7 + #define EAX 0 + #define EBX 1 + #define ECX 2 + #define EDX 3 + + #define mag_cpy_regs(id) \ + regs[id][EAX] = eax; \ + regs[id][EBX] = ebx; \ + regs[id][ECX] = ecx; \ + regs[id][EDX] = edx + + #define _(enumerator, leaf, reg, shift) (0xff&leaf) + static const uint8_t feature_leaves[MAG_AMD64_CAP__NUM] = { + mag_x86_64_feature_def(_, MAG_SEP) + }; + #undef _ + #define _(enumerator, leaf, reg, shift) (0xff®) + static const uint8_t feature_regs[MAG_AMD64_CAP__NUM] = { + mag_x86_64_feature_def(_, MAG_SEP) + }; + #undef _ + #define _(enumerator, leaf, reg, shift) (1u<<(shift)) + static const uint32_t feature_masks[MAG_AMD64_CAP__NUM] = { + mag_x86_64_feature_def(_, MAG_SEP) + }; + #undef _ + #undef mag_x86_64_feature_def + #undef _ + uint32_t eax=0, ebx=0, ecx=0, edx=0; uint32_t max_basic_leaf, max_extended_leaf; mag_cpuid(0, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(0H); + mag_cpy_regs(H0); max_basic_leaf = eax; 
mag_cpuid(0x80000000u, -1, &eax, &ebx, &ecx, &edx); max_extended_leaf = eax; if (max_basic_leaf >= 1u) { mag_cpuid(1, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(1H); + mag_cpy_regs(H1); } if (max_basic_leaf >= 2u) { mag_cpuid(2u, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(2H); + mag_cpy_regs(H2); } if (max_basic_leaf >= 7u) { mag_cpuid(7u, 0, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(7H); + mag_cpy_regs(H7); } if (max_basic_leaf >= 7u) { mag_cpuid(7u, 1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(7H_1H); + mag_cpy_regs(H7_1H); } if (max_basic_leaf >= 0x16u) { mag_cpuid(0x16u, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(16H); + mag_cpy_regs(H16); } if (max_extended_leaf >= 0x80000001u) { mag_cpuid(0x80000001u, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(80000001H); + mag_cpy_regs(H80000001); } if (max_extended_leaf >= 0x80000007u) { mag_cpuid(0x80000007u, -1, &eax, &ebx, &ecx, &edx); - mag_cpy_regs(80000007H); + mag_cpy_regs(H80000007); } - bool cpu_avx_support = ((*features)[MAG_X86_64_CPUID_1H][MAG_X86_64_CPUID_ECX] & 0x10000000u) != 0; - bool cpu_osxsave_support = ((*features)[MAG_X86_64_CPUID_1H][MAG_X86_64_CPUID_ECX] & 0x8000000u) != 0; + bool cpu_avx_support = !!(regs[H1][ECX] & 0x10000000u); + bool cpu_osxsave_support = !!(regs[H1][ECX] & 0x8000000u); if (cpu_avx_support && cpu_osxsave_support) { uint64_t xcr0 = mag_xgetbv(); if ((xcr0 & 0x6) != 0x6u) { - (*features)[MAG_X86_64_CPUID_1H][MAG_X86_64_CPUID_ECX] &= ~0x10000000u; /* Clear AVX */ - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EBX] &= ~0x20u; /* Clear AVX2 */ + regs[H1][ECX] &= ~0x10000000u; /* Clear AVX */ + regs[H7][EBX] &= ~0x20u; /* Clear AVX2 */ } if ((xcr0 & 0xe0) != 0xe0u) { /* OS does not support AVX-512, clear AVX512 */ - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EBX] &= ~0xdc230000u; - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_ECX] &= ~0x5842u; - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EDX] &= ~0x10cu; - (*features)[MAG_X86_64_CPUID_7H_1H][MAG_X86_64_CPUID_EAX] &= ~0x20u; + regs[H7][EBX] &= ~0xdc230000u; + regs[H7][ECX] &= ~0x5842u; + regs[H7][EDX] &= ~0x10cu; + regs[H7_1H][EAX] &= ~0x20u; } } else { - (*features)[MAG_X86_64_CPUID_1H][MAG_X86_64_CPUID_ECX] &= ~0x10000000u; /* Clear AVX */ - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EBX] &= ~0x20u; /* Clear AVX2 */ - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EBX] &= ~0xdc230000u; /* Clear AVX512 */ - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_ECX] &= ~0x5842u; /* Clear AVX512 */ - (*features)[MAG_X86_64_CPUID_7H][MAG_X86_64_CPUID_EDX] &= ~0x10cu; /* Clear AVX512 */ - (*features)[MAG_X86_64_CPUID_7H_1H][MAG_X86_64_CPUID_EAX] &= ~0x20u; /* Clear AVX512 */ + regs[H1][ECX] &= ~0x10000000u; /* Clear AVX */ + regs[H7][EBX] &= ~0x20u; /* Clear AVX2 */ + regs[H7][EBX] &= ~0xdc230000u; /* Clear AVX512 */ + regs[H7][ECX] &= ~0x5842u; /* Clear AVX512 */ + regs[H7][EDX] &= ~0x10cu; /* Clear AVX512 */ + regs[H7_1H][EAX] &= ~0x20u; /* Clear AVX512 */ } - } - #undef mag_cpy_regs -#elif defined(__aarch64__) && defined(__linux__) -static void MAG_COLDPROC mag_system_info_query_arm64_cpu_features(long* hwcap) { - *hwcap = getauxval(AT_HWCAP); + for (uint64_t i=1; i < MAG_AMD64_CAP__NUM; ++i) /* Create bitset of features */ + if (regs[feature_leaves[i]][feature_regs[i]] & feature_masks[i]) + *caps |= 1ull<sys.os_name); - mag_system_host_info_query_cpu_name(&ctx->sys.cpu_name); - mag_system_host_info_query_cpu_cores(&ctx->sys.cpu_virtual_cores, &ctx->sys.cpu_physical_cores, &ctx->sys.cpu_sockets); - 
mag_system_host_info_query_memory(&ctx->sys.phys_mem_total, &ctx->sys.phys_mem_free); + mag_system_host_info_query_os_name(&ctx->machine.os_name); + mag_system_host_info_query_cpu_name(&ctx->machine.cpu_name); + mag_system_host_info_query_cpu_cores(&ctx->machine.cpu_virtual_cores, &ctx->machine.cpu_physical_cores, &ctx->machine.cpu_sockets); + mag_system_host_info_query_memory(&ctx->machine.phys_mem_total, &ctx->machine.phys_mem_free); #if defined(__x86_64__) || defined(_M_X64) - mag_system_info_query_x86_64_cpu_features(&ctx->sys.x86_64_cpu_features); - #elif defined(__aarch64__) && defined(__linux__) - mag_system_info_query_arm64_cpu_features(&ctx->sys.cpu_arm64_hwcap); + mag_system_info_query_amd64_cpu_caps(&ctx->machine.amd64_cpu_caps); + #elif defined(__aarch64__) + mag_system_info_query_arm64_cpu_caps(&ctx->machine.arm64_cpu_caps, &ctx->machine.arm64_cpu_sve_width); #endif - if (mag_unlikely(!*ctx->sys.os_name)) snprintf(ctx->sys.os_name, sizeof(ctx->sys.os_name), "Unknown"); - if (mag_unlikely(!*ctx->sys.cpu_name)) snprintf(ctx->sys.cpu_name, sizeof(ctx->sys.cpu_name), "Unknown"); + if (mag_unlikely(!*ctx->machine.os_name)) snprintf(ctx->machine.os_name, sizeof(ctx->machine.os_name), "Unknown"); + if (mag_unlikely(!*ctx->machine.cpu_name)) snprintf(ctx->machine.cpu_name, sizeof(ctx->machine.cpu_name), "Unknown"); } static MAG_AINLINE void mag_sto_write_u32_le(uint8_t** p, uint32_t x) { @@ -3004,7 +3069,7 @@ MAG_EXPORT mag_tensor_t** mag_sto_read_buffered(mag_ctx_t* ctx, const uint8_t* b if (mag_unlikely(!mag_sto_read_file_header(&needle, end, out_version, &n_tensors, &ud))) return NULL; /* Read file header */ if (mag_unlikely(!*out_version || *out_version > MAG_VERSION)) return NULL; if (mag_unlikely(!n_tensors)) return NULL; - mag_tensor_t** tensors = (mag_tensor_t**)(*mag_alloc)(NULL, n_tensors*sizeof(*tensors)); /* Allocate return tensor array */ + mag_tensor_t** tensors = (*mag_alloc)(NULL, n_tensors*sizeof(*tensors)); /* Allocate return tensor array */ for (size_t i=0; i < n_tensors; ++i) { /* Read tensor headers */ char name[MAG_MAX_TENSOR_NAME_LEN] = {0}; mag_tensor_flags_t flags = 0; diff --git a/magnetron/magnetron_cpu.c b/magnetron/magnetron_cpu.c index 8aaaa45..ac16892 100644 --- a/magnetron/magnetron_cpu.c +++ b/magnetron/magnetron_cpu.c @@ -10,52 +10,49 @@ extern void mag_cpu_blas_specialization_fallback(mag_kernel_registry_t* kernels) typedef struct mag_amd64_blas_specialization { const char* name; - const mag_x86_64_feature_t* (*get_feature_permutation)(size_t* out_num); + uint64_t (*get_feature_permutation)(void); void (*inject_kernels)(mag_kernel_registry_t* kernels); } mag_amd64_blas_specialization; -#define mag_cpu_blas_spec_decl(feat) \ - const mag_x86_64_feature_t* mag_cpu_blas_specialization_amd64_##feat##_features(size_t* out_num); \ - extern void mag_cpu_blas_specialization_amd64_##feat(mag_kernel_registry_t* kernels) +#define mag_amd64_blas_spec_decl(feat) \ + uint64_t mag_cpu_blas_specialization_amd64_v##feat##_features(void); \ + extern void mag_cpu_blas_specialization_amd64_v##feat(mag_kernel_registry_t* kernels) #define mag_amd64_blas_spec_permute(feat) \ (mag_amd64_blas_specialization) { \ - .name = "amd64_"#feat, \ - .get_feature_permutation = &mag_cpu_blas_specialization_amd64_##feat##_features, \ - .inject_kernels = &mag_cpu_blas_specialization_amd64_##feat \ + .name = "amd64-v."#feat, \ + .get_feature_permutation = &mag_cpu_blas_specialization_amd64_v##feat##_features, \ + .inject_kernels = &mag_cpu_blas_specialization_amd64_v##feat \ } 
-mag_cpu_blas_spec_decl(znver4); -mag_cpu_blas_spec_decl(avx512f); -mag_cpu_blas_spec_decl(avx2); -mag_cpu_blas_spec_decl(avx); -mag_cpu_blas_spec_decl(sse41); - -static const mag_amd64_blas_specialization mag_amd64_blas_specializations[] = { /* Dynamic selectable BLAS permutations, sorted from best to worst score. */ - mag_amd64_blas_spec_permute(znver4), - mag_amd64_blas_spec_permute(avx512f), - mag_amd64_blas_spec_permute(avx2), - mag_amd64_blas_spec_permute(avx), - mag_amd64_blas_spec_permute(sse41), -}; +mag_amd64_blas_spec_decl(4_5); +mag_amd64_blas_spec_decl(4); +mag_amd64_blas_spec_decl(3); +mag_amd64_blas_spec_decl(2_5); +mag_amd64_blas_spec_decl(2); static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_registry_t* kernels) { + const mag_amd64_blas_specialization mag_amd64_blas_specializations[] = { /* Dynamically selectable BLAS permutations, sorted from best to worst score. */ + mag_amd64_blas_spec_permute(4_5), + mag_amd64_blas_spec_permute(4), + mag_amd64_blas_spec_permute(3), + mag_amd64_blas_spec_permute(2_5), + mag_amd64_blas_spec_permute(2), + }; + + uint64_t cap_avail = ctx->machine.amd64_cpu_caps; for (size_t i=0; i < sizeof(mag_amd64_blas_specializations)/sizeof(*mag_amd64_blas_specializations); ++i) { /* Find best blas spec for the host CPU */ const mag_amd64_blas_specialization* spec = mag_amd64_blas_specializations+i; - size_t num_features = 0; - const mag_x86_64_feature_t* features = (*spec->get_feature_permutation)(&num_features); /* Get requires features */ - if (mag_unlikely(!num_features || !features)) continue; - bool has_all_features = true; - for (size_t j=0; j < num_features; ++j) /* For each requested feature, check if host CPU supports it */ - has_all_features &= mag_ctx_x86_64_cpu_has_feature(ctx, features[j]); - if (has_all_features) { /* Since specializations are sorted by score, we found the perfect spec. */ + uint64_t cap_required = (*spec->get_feature_permutation)(); /* Get required features */ + if ((cap_avail & cap_required) == cap_required) { /* Since specializations are sorted by score, we found the perfect spec.
*/ (*spec->inject_kernels)(kernels); - mag_log_info("Using BLAS specialization: %s", spec->name); + mag_log_info("Using tuned BLAS specialization: %s", spec->name); return true; } } /* No matching specialization found, use generic */ mag_cpu_blas_specialization_fallback(kernels); + mag_log_info("Using fallback BLAS specialization"); return false; /* No spec used, fallback is active */ } @@ -64,24 +61,45 @@ static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_re #elif defined(__aarch64__) || defined(_M_ARM64) -#define mag_cpu_blas_spec_name(feat) mag_cpu_blas_specialization_arm64_##feat -#define mag_cpu_blas_spec_decl(feat) extern void mag_cpu_blas_spec_name(feat)(mag_kernel_registry_t* kernels) +typedef struct mag_arm64_blas_specialization { + const char* name; + uint64_t (*get_cap_permutation)(void); + void (*inject_kernels)(mag_kernel_registry_t* kernels); +} mag_arm64_blas_specialization; + +#define mag_arm64_blas_spec_decl(feat) \ + uint64_t mag_cpu_blas_specialization_arm64_v_##feat##_features(void); \ + extern void mag_cpu_blas_specialization_arm64_v_##feat(mag_kernel_registry_t* kernels) + +#define mag_arm64_blas_spec_permute(feat) \ + (mag_arm64_blas_specialization) { \ + .name = "arm64-v."#feat, \ + .get_cap_permutation = &mag_cpu_blas_specialization_arm64_v_##feat##_features, \ + .inject_kernels = &mag_cpu_blas_specialization_arm64_v_##feat \ +} -mag_cpu_blas_spec_decl(82); +mag_arm64_blas_spec_decl(9); +mag_arm64_blas_spec_decl(8_2); static bool mag_blas_detect_gen_optimal_spec(const mag_ctx_t* ctx, mag_kernel_registry_t* kernels) { -#ifdef __linux__ - long hwcap = ctx->sys.cpu_arm64_hwcap; - if (hwcap & HWCAP_FPHP) && (hwcap & HWCAP_ASIMDHP) && (hwcap && HWCAP_ASIMDDP)) { /* ARM v.8.2 f16 scalar + f16 vec + dotprod */ - mag_cpu_blas_spec_name(82)(kernels); - return true; + const mag_arm64_blas_specialization mag_arm64_blas_specializations[] = { /* Dynamically selectable BLAS permutations, sorted from best to worst score. */ + mag_arm64_blas_spec_permute(9), + mag_arm64_blas_spec_permute(8_2), + }; + + uint64_t cap_avail = ctx->machine.arm64_cpu_caps; + for (size_t i=0; i < sizeof(mag_arm64_blas_specializations)/sizeof(*mag_arm64_blas_specializations); ++i) { /* Find best blas spec for the host CPU */ + const mag_arm64_blas_specialization* spec = mag_arm64_blas_specializations+i; + uint64_t cap_required = (*spec->get_cap_permutation)(); /* Get required features */ + if ((cap_avail & cap_required) == cap_required) { /* Since specializations are sorted by score, we found the perfect spec.
*/ + (*spec->inject_kernels)(kernels); + mag_log_info("Using tuned BLAS specialization: %s", spec->name); + return true; + } } -#elif defined(__APPLE__) - /* TODO - currently using ARM v8 baseline but Apple M2/M3 have newer arm versions we could target */ - /* mag_cpu_blas_spec_name(82)(kernels); */ -#endif /* No matching specialization found, use generic */ mag_cpu_blas_specialization_fallback(kernels); + mag_log_info("Using fallback BLAS specialization"); return false; /* No spec used, fallback is active */ } @@ -422,7 +440,7 @@ static mag_compute_device_t* mag_cpu_init_interface(mag_ctx_t* ctx, uint32_t num .alloc_storage = &mag_cpu_alloc_storage, .free_storage = &mag_cpu_free_storage }; - snprintf(dvc->name, sizeof(dvc->name), "%s - %s - Using %u Compute Threads", mag_device_type_get_name(dvc->type), ctx->sys.cpu_name, num_threads); + snprintf(dvc->name, sizeof(dvc->name), "%s", ctx->machine.cpu_name); return dvc; } @@ -433,7 +451,7 @@ static void mag_cpu_release_interface(mag_compute_device_t* ctx) { } mag_compute_device_t* mag_init_device_cpu(mag_ctx_t* ctx, const mag_device_descriptor_t* desc) { - uint32_t hw_concurrency = mag_xmax(1, ctx->sys.cpu_virtual_cores); + uint32_t hw_concurrency = mag_xmax(1, ctx->machine.cpu_virtual_cores); uint32_t num_threads = desc->thread_count; num_threads = num_threads ? num_threads : hw_concurrency; mag_compute_device_t* dvc = mag_cpu_init_interface(ctx, num_threads); diff --git a/magnetron/magnetron_cpu_blas.inl b/magnetron/magnetron_cpu_blas.inl index d525884..182bd20 100644 --- a/magnetron/magnetron_cpu_blas.inl +++ b/magnetron/magnetron_cpu_blas.inl @@ -1,3 +1,49 @@ +/* (c) 2025 Mario "Neo" Sieg. */ + +/* +** This file implements the core math for magnetron, optimized for different CPU instruction sets. +** This file is also included into different compilation units, which are all compiled with different architecture flags, thus the impl is 'cloned'. +** At runtime the best impl for the host CPU is chosen automatically, by detecting the CPU and querying the hardware features. +** +** !!! Minimum Requirements !!! +** AMD 64 CPUs: SSE & SSE2 (any 64-bit AMD64 CPU). +** ARM 64 CPUs: ARM v8-a (Raspberry Pi 4, 5, Apple M1-4, Neoverse/Graviton etc.) +** +** +==============+=============+==============+======================================================+ +** | AMD 64 Versions and Features +** +==============+=============+==============+======================================================+ +** | x86-64-v1 | CMOV, CX8, FPU, FXSR, MMX, OSFXSR, SCE, SSE, SSE2 +** | x86-64-v2 | CMPXCHG16B, LAHF-SAHF, POPCNT, SSE3, SSE4_1, SSE4_2, SSSE3 +** | x86-64-v3 | AVX, AVX2, BMI1, BMI2, F16C, FMA, LZCNT, MOVBE, OSXSAVE +** | x86-64-v4 | AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL +** +==============+=============+==============+======================================================+ +** Some CPUs fall in between those levels; for example, my old rusty test server has four old AMD Opteron CPUs with 16 cores each, which support AVX but not AVX2. +** For CPUs like this, we also support more granular feature levels: v2 (SSE4.2), v2.5 (v2 plus AVX), v3 (AVX2, FMA), v4 (AVX-512) and v4.5 (v4 plus AVX-512 VNNI/BF16).
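+** For example, a Sandy-Bridge-class CPU reports AVX but not AVX2, so its capability bitset satisfies the v2_5 kernels but not the v3 ones; the dispatcher in magnetron_cpu.c walks the specializations from v4_5 down to v2 and injects the first one whose required bits are a subset of the host bits, i.e. (cap_avail & cap_required) == cap_required.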
+** +** +** +** +==============+=============+==============+======================================================+ +** | ARM 64 Versions and Features +** +==============+=============+==============+======================================================+ +** | armv8-a | Armv8-A | | +fp, +simd +** | armv8.1-a | Armv8.1-A | armv8-a, | +crc, +lse, +rdma +** | armv8.2-a | Armv8.2-A | armv8.1-a | +** | armv8.3-a | Armv8.3-A | armv8.2-a, | +pauth, +fcma, +jscvt +** | armv8.4-a | Armv8.4-A | armv8.3-a, | +flagm, +fp16fml, +dotprod, +rcpc2 +** | armv8.5-a | Armv8.5-A | armv8.4-a, | +sb, +ssbs, +predres, +frintts, +flagm2 +** | armv8.6-a | Armv8.6-A | armv8.5-a, | +bf16, +i8mm +** | armv8.7-a | Armv8.7-A | armv8.6-a, | +wfxt, +xs +** | armv8.8-a | Armv8.8-a | armv8.7-a, | +mops +** | armv8.9-a | Armv8.9-a | armv8.8-a | +** | armv9-a | Armv9-A | armv8.5-a, | +sve, +sve2 +** | armv9.1-a | Armv9.1-A | armv9-a, | +bf16, +i8mm +** | armv9.2-a | Armv9.2-A | armv9.1-a | +** | armv9.3-a | Armv9.3-A | armv9.2-a, | +mops +** | armv9.4-a | Armv9.4-A | armv9.3-a | +** | armv8-r | Armv8-R | armv8-r | +** +==============+=============+==============+======================================================+ +*/ + #include "magnetron_internal.h" #include @@ -1381,139 +1427,164 @@ static void MAG_HOTPROC mag_blas_matmul_f32(const mag_compute_payload_t* payload #ifndef MAG_BLAS_SPECIALIZATION #error "BLAS specialization undefined" #endif - -#if defined(__x86_64__) || defined(_M_X64) #ifndef MAG_BLAS_SPECIALIZATION_FEAT_REQUEST #error "Feature request routine undefined" #endif -const mag_x86_64_feature_t* MAG_BLAS_SPECIALIZATION_FEAT_REQUEST(size_t* out_num) { - static const mag_x86_64_feature_t required_features[] = { - #ifdef __AVX512F__ - MAG_X86_64_FEATURE_AVX512F, - #endif - #ifdef __AVX512BW__ - MAG_X86_64_FEATURE_AVX512BW, - #endif - #ifdef __AVX512CD__ - MAG_X86_64_FEATURE_AVX512CD, - #endif - #ifdef __AVX512DQ__ - MAG_X86_64_FEATURE_AVX512DQ, - #endif - #ifdef __AVX512ER__ - MAG_X86_64_FEATURE_AVX512ER, - #endif - #ifdef __AVX512IFMA__ - MAG_X86_64_FEATURE_AVX512IFMA, - #endif - #ifdef __AVX512PF__ - MAG_X86_64_FEATURE_AVX512PF, - #endif - #ifdef __AVX512VBMI__ - MAG_X86_64_FEATURE_AVX512VBMI, - #endif - #ifdef __AVX512VL__ - MAG_X86_64_FEATURE_AVX512VL, - #endif - #ifdef __AVX512_4FMAPS__ - MAG_X86_64_FEATURE_AVX512_4FMAPS, - #endif - #ifdef __AVX512_4VNNIW__ - MAG_X86_64_FEATURE_AVX512_4VNNIW, - #endif - #ifdef __AVX512_FP16__ - MAG_X86_64_FEATURE_AVX512_FP16, - #endif - #ifdef __AVX512_BF16__ - MAG_X86_64_FEATURE_AVX512_BF16, - #endif - #ifdef __AVX512_BITALG__ - MAG_X86_64_FEATURE_AVX512_BITALG, - #endif - #ifdef __AVX512_VBMI2__ - MAG_X86_64_FEATURE_AVX512_VBMI2, - #endif - #ifdef __AVX512_VNNI__ - MAG_X86_64_FEATURE_AVX512_VNNI, - #endif - #ifdef __AVX512_VP2INTERSECT__ - MAG_X86_64_FEATURE_AVX512_VP2INTERSECT, - #endif - #ifdef __AVX512_VPOPCNTDQ__ - MAG_X86_64_FEATURE_AVX512_VPOPCNTDQ, - #endif - #ifdef __AVX__ - MAG_X86_64_FEATURE_AVX, - #endif - #ifdef __AVX2__ - MAG_X86_64_FEATURE_AVX2, - #endif - #ifdef __AVXVNNI__ - MAG_X86_64_FEATURE_AVXVNNI, - #endif - #ifdef __AVXVNNIINT8__ - MAG_X86_64_FEATURE_AVXVNNIINT8, - #endif - #ifdef __AVXVNNIINT16__ - MAG_X86_64_FEATURE_AVXVNNIINT16, - #endif - #ifdef __BMI__ - MAG_X86_64_FEATURE_BMI, - #endif - #ifdef __BMI2__ - MAG_X86_64_FEATURE_BMI2, - #endif - #ifdef __F16C__ - MAG_X86_64_FEATURE_F16C, - #endif - #ifdef __FMA__ - MAG_X86_64_FEATURE_FMA, - #endif - #ifdef __GFNI__ - MAG_X86_64_FEATURE_GFNI, - #endif - #ifdef __PCLMUL__ - 
MAG_X86_64_FEATURE_PCLMUL, - #endif - #ifdef __RDRND__ - MAG_X86_64_FEATURE_RDRND, - #endif - #ifdef __RDSEED__ - MAG_X86_64_FEATURE_RDSEED, - #endif - #ifdef __RDTSCP__ - MAG_X86_64_FEATURE_RDTSCP, - #endif - #ifdef __SHA__ - MAG_X86_64_FEATURE_SHA, - #endif - #ifdef __SSE3__ - MAG_X86_64_FEATURE_SSE3, - #endif - #ifdef __SSE4_1__ - MAG_X86_64_FEATURE_SSE4_1, - #endif - #ifdef __SSE4_2__ - MAG_X86_64_FEATURE_SSE4_2, - #endif - #ifdef __SSSE3__ - MAG_X86_64_FEATURE_SSSE3, - #endif - #ifdef __VAES__ - MAG_X86_64_FEATURE_VAES, - #endif - #ifdef __VPCLMULQDQ__ - MAG_X86_64_FEATURE_VPCLMULQDQ, - #endif - #ifdef __XSAVE__ - MAG_X86_64_FEATURE_XSAVE, - #endif - MAG_X86_64_FEATURE_SSE2, /* always required */ - }; - *out_num = sizeof(required_features)/sizeof(*required_features); - return required_features; +#if defined(__x86_64__) || defined(_M_X64) +uint64_t MAG_BLAS_SPECIALIZATION_FEAT_REQUEST() { + uint64_t caps = 1ull< */ - -#ifndef __AVX512F__ -#error "BLAS specialization requires matching compile flags" -#endif -#ifdef __AVX512DQ__ -//#error "BLAS specialization feature too high" -#endif - -#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_avx512f -#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_avx512f_features - -#include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_sse42.c b/magnetron/magnetron_cpu_blas_amd64_v2.c similarity index 61% rename from magnetron/magnetron_cpu_blas_amd64_sse42.c rename to magnetron/magnetron_cpu_blas_amd64_v2.c index 656cd98..39a5bfa 100644 --- a/magnetron/magnetron_cpu_blas_amd64_sse42.c +++ b/magnetron/magnetron_cpu_blas_amd64_v2.c @@ -1,13 +1,22 @@ /* (c) 2025 Mario "Neo" Sieg. */ -#ifndef __SSE4_2__ + +#ifndef _MSC_VER +#if !defined(__SSE__) \ + || !defined(__SSE2__) \ + || !defined(__SSE3__) \ + || !defined(__SSSE3__) \ + || !defined(__SSE4_1__) \ + || !defined(__SSE4_2__) #error "BLAS specialization requires matching compile flags" #endif +#endif #ifdef __AVX__ #error "BLAS specialization feature too high" #endif -#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_sse41 -#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_sse41_features +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_v2 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_v2_features #include "magnetron_cpu_blas.inl" + diff --git a/magnetron/magnetron_cpu_blas_amd64_avx.c b/magnetron/magnetron_cpu_blas_amd64_v2_5.c similarity index 58% rename from magnetron/magnetron_cpu_blas_amd64_avx.c rename to magnetron/magnetron_cpu_blas_amd64_v2_5.c index f82ad40..a025549 100644 --- a/magnetron/magnetron_cpu_blas_amd64_avx.c +++ b/magnetron/magnetron_cpu_blas_amd64_v2_5.c @@ -1,13 +1,22 @@ /* (c) 2025 Mario "Neo" Sieg. 
*/ -#ifndef __AVX__ + +#ifndef _MSC_VER +#if !defined(__SSE__) \ + || !defined(__SSE2__) \ + || !defined(__SSE3__) \ + || !defined(__SSSE3__) \ + || !defined(__SSE4_1__) \ + || !defined(__SSE4_2__) \ + || !defined(__AVX__) #error "BLAS specialization requires matching compile flags" #endif +#endif #ifdef __AVX2__ #error "BLAS specialization feature too high" #endif -#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_avx -#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_avx_features +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_v2_5 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_v2_5_features #include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_v3.c b/magnetron/magnetron_cpu_blas_amd64_v3.c new file mode 100644 index 0000000..474299c --- /dev/null +++ b/magnetron/magnetron_cpu_blas_amd64_v3.c @@ -0,0 +1,26 @@ +/* (c) 2025 Mario "Neo" Sieg. */ + +#ifndef _MSC_VER +#if !defined(__SSE__) \ + || !defined(__SSE2__) \ + || !defined(__SSE3__) \ + || !defined(__SSSE3__) \ + || !defined(__SSE4_1__) \ + || !defined(__SSE4_2__) \ + || !defined(__AVX__) \ + || !defined(__AVX2__) \ + || !defined(__BMI__) \ + || !defined(__BMI2__) \ + || !defined(__F16C__) \ + || !defined(__FMA__) +#error "BLAS specialization requires matching compile flags" +#endif +#endif +#ifdef __AVX512F__ +#error "BLAS specialization feature too high" +#endif + +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_v3 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_v3_features + +#include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_v4.c b/magnetron/magnetron_cpu_blas_amd64_v4.c new file mode 100644 index 0000000..2ed24fc --- /dev/null +++ b/magnetron/magnetron_cpu_blas_amd64_v4.c @@ -0,0 +1,29 @@ +/* (c) 2025 Mario "Neo" Sieg. */ + + +#ifndef _MSC_VER +#if !defined(__SSE__) \ + || !defined(__SSE2__) \ + || !defined(__SSE3__) \ + || !defined(__SSSE3__) \ + || !defined(__SSE4_1__) \ + || !defined(__SSE4_2__) \ + || !defined(__AVX__) \ + || !defined(__AVX2__) \ + || !defined(__F16C__) \ + || !defined(__FMA__) \ + || !defined(__AVX512F__) \ + || !defined(__AVX512BW__) \ + || !defined(__AVX512DQ__) \ + || !defined(__AVX512VL__) +#error "BLAS specialization requires matching compile flags" +#endif +#endif +#ifdef __AVX512VNNI__ +#error "BLAS specialization feature too high" +#endif + +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_v4 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_v4_features + +#include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_v4_5.c b/magnetron/magnetron_cpu_blas_amd64_v4_5.c new file mode 100644 index 0000000..de8e725 --- /dev/null +++ b/magnetron/magnetron_cpu_blas_amd64_v4_5.c @@ -0,0 +1,32 @@ +/* (c) 2025 Mario "Neo" Sieg. 
*/ + +#ifndef _MSC_VER +#if !defined(__SSE__) \ + || !defined(__SSE2__) \ + || !defined(__SSE3__) \ + || !defined(__SSSE3__) \ + || !defined(__SSE4_1__) \ + || !defined(__SSE4_2__) \ + || !defined(__AVX__) \ + || !defined(__AVX2__) \ + || !defined(__BMI__) \ + || !defined(__BMI2__) \ + || !defined(__F16C__) \ + || !defined(__FMA__) \ + || !defined(__AVX512F__) \ + || !defined(__AVX512BW__) \ + || !defined(__AVX512DQ__) \ + || !defined(__AVX512VL__) \ + || !defined(__AVX512VNNI__) \ + || !defined(__AVX512BF16__) +#error "BLAS specialization requires matching compile flags" +#endif +#endif +#ifdef __APX__ +#error "BLAS specialization feature too high" +#endif + +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_v4_5 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_v4_5_features + +#include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_znver4.c b/magnetron/magnetron_cpu_blas_amd64_znver4.c deleted file mode 100644 index 23bca62..0000000 --- a/magnetron/magnetron_cpu_blas_amd64_znver4.c +++ /dev/null @@ -1,11 +0,0 @@ -/* (c) 2025 Mario "Neo" Sieg. */ - -#if !defined(__AVX512F__) || !defined(__AVX512VL__) || !defined(__AVX512VNNI__) - || !defined(__AVX512BF16__) || !defined(__AVX512BW__) || !defined(__AVX512DQ__) -#error "BLAS specialization requires matching compile flags" -#endif - -#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_znver4 -#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_znver4_features - -#include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_arm64_82.c b/magnetron/magnetron_cpu_blas_arm64_v8_2.c similarity index 78% rename from magnetron/magnetron_cpu_blas_arm64_82.c rename to magnetron/magnetron_cpu_blas_arm64_v8_2.c index 27d5664..df640a0 100644 --- a/magnetron/magnetron_cpu_blas_arm64_82.c +++ b/magnetron/magnetron_cpu_blas_arm64_v8_2.c @@ -6,6 +6,7 @@ #error "BLAS specialization requires matching compile flags" #endif -#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_arm64_82 +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_arm64_v_8_2 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_arm64_v_8_2_features #include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_cpu_blas_amd64_avx2.c b/magnetron/magnetron_cpu_blas_arm64_v9.c similarity index 54% rename from magnetron/magnetron_cpu_blas_amd64_avx2.c rename to magnetron/magnetron_cpu_blas_arm64_v9.c index 505d4c1..73cb157 100644 --- a/magnetron/magnetron_cpu_blas_amd64_avx2.c +++ b/magnetron/magnetron_cpu_blas_arm64_v9.c @@ -1,13 +1,10 @@ /* (c) 2025 Mario "Neo" Sieg. 
*/ -#ifndef __AVX2__ +#if !defined(__ARM_FEATURE_SVE) || !defined(__ARM_FEATURE_SVE2) #error "BLAS specialization requires matching compile flags" #endif -#ifdef __AVX512f__ -#error "BLAS specialization feature too high" -#endif -#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_amd64_avx2 -#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_amd64_avx2_features +#define MAG_BLAS_SPECIALIZATION mag_cpu_blas_specialization_arm64_v_9 +#define MAG_BLAS_SPECIALIZATION_FEAT_REQUEST mag_cpu_blas_specialization_arm64_v_9_features #include "magnetron_cpu_blas.inl" diff --git a/magnetron/magnetron_internal.h b/magnetron/magnetron_internal.h index a31c05e..d1defb6 100644 --- a/magnetron/magnetron_internal.h +++ b/magnetron/magnetron_internal.h @@ -573,6 +573,91 @@ struct mag_tensor_node_t { }; #endif +#if defined(__x86_64__) || defined(_M_X64) +#define mag_x86_64_feature_def(_, __) /* Enumerator | CPUID Leaf | Register | Shift */\ + _(NONE , 0, EAX, 0)__\ + _(AVX , H1, ECX, 28)__\ + _(AVX2 , H7, EBX, 5)__\ + _(AVXVNNI , H7_1H, EAX, 4)__\ + _(AVXVNNIINT8 , H7_1H, EDX, 4)__\ + _(AVXVNNIINT16 , H7_1H, EDX, 10)__\ + _(AVX512BW , H7, EBX, 30)__\ + _(AVX512CD , H7, EBX, 28)__\ + _(AVX512DQ , H7, EBX, 17)__\ + _(AVX512ER , H7, EBX, 27)__\ + _(AVX512F , H7, EBX, 16)__\ + _(AVX512IFMA , H7, EBX, 21)__\ + _(AVX512PF , H7, EBX, 26)__\ + _(AVX512VBMI , H7, ECX, 1)__\ + _(AVX512VL , H7, EBX, 31)__\ + _(AVX512_4FMAPS , H7, EDX, 3)__\ + _(AVX512_4VNNIW , H7, EDX, 2)__\ + _(AVX512_FP16 , H7, EDX, 23)__\ + _(AVX512_BF16 , H7_1H, EAX, 5)__\ + _(AVX512_BITALG , H7, ECX, 12)__\ + _(AVX512_VBMI2 , H7, ECX, 6)__\ + _(AVX512_VNNI , H7, ECX, 11)__\ + _(AVX512_VP2INTERSECT , H7, EDX, 8)__\ + _(AVX512_VPOPCNTDQ , H7, ECX, 14)__\ + _(BMI , H7, EBX, 3)__\ + _(BMI2 , H7, EBX, 8)__\ + _(F16C , H1, ECX, 29)__\ + _(FMA , H1, ECX, 12)__\ + _(FPU , H1, EDX, 0)__\ + _(GFNI , H7, ECX, 8)__\ + _(IA64 , H1, EDX, 30)__\ + _(MMX , H1, EDX, 23)__\ + _(OSXSAVE , H1, ECX, 27)__\ + _(PCLMUL , H1, ECX, 1)__\ + _(RDRND , H1, ECX, 30)__\ + _(RDSEED , H7, EBX, 18)__\ + _(RDTSCP , H80000001, EDX, 27)__\ + _(SHA , H7, EBX, 29)__\ + _(SSE , H1, EDX, 25)__\ + _(SSE2 , H1, EDX, 26)__\ + _(SSE3 , H1, ECX, 0)__\ + _(SSE4_1 , H1, ECX, 19)__\ + _(SSE4_2 , H1, ECX, 20)__\ + _(SSSE3 , H1, ECX, 9)__\ + _(VAES , H7, ECX, 9)__\ + _(VME , H1, EDX, 1)__\ + _(VMX , H1, ECX, 5)__\ + _(VPCLMULQDQ , H7, ECX, 10)__\ + _(XSAVE , H1, ECX, 26)__\ + _(HYBRID_CPU , H7, EDX, 15)__ + +#define _(ident, leaf, reg, bit) MAG_AMD64_CAP_##ident +typedef enum mag_amd64_cap_t { + mag_x86_64_feature_def(_, MAG_SEP) + MAG_AMD64_CAP__NUM +} mag_amd64_cap_t; +#undef _ + +extern const char* const mag_amd64_cap_names[MAG_AMD64_CAP__NUM]; + +#elif defined(__aarch64__) + +#define mag_arm64_feature_def(_, __) /* Enumerator */\ + _(NONE)__\ + _(NEON)__\ + _(DOTPROD)__\ + _(I8MM)__\ + _(F16SCA)__\ + _(F16VEC)__\ + _(BF16)__\ + _(SVE)__\ + _(SVE2)__ + +#define _(ident) MAG_ARM64_CAP_##ident +typedef enum mag_arm64_cap_t { + mag_arm64_feature_def(_, MAG_SEP) + MAG_ARM64_CAP__NUM +} mag_arm64_cap_t; +#undef _ +extern const char* const mag_arm64_cap_names[MAG_ARM64_CAP__NUM]; + +#endif + /* ** Context contains all isolated state and data. ** Lifetimes of tensors and compute graphs are bound to the context - the context is the owner. @@ -588,11 +673,12 @@ struct mag_ctx_t { uint64_t phys_mem_total; /* Total physical memory in bytes. */ uint64_t phys_mem_free; /* Free physical memory in bytes.
*/ #if defined(__x86_64__) || defined(_M_X64) - uint32_t x86_64_cpu_features[8][4]; /* x86-64 CPU features. */ + uint64_t amd64_cpu_caps; /* x86-64 CPU features. Bitset of 1ull<var, prefix) -#if defined(__x86_64__) || defined(_M_X64) -#define MAG_X86_64_CPUID_0H 0 -#define MAG_X86_64_CPUID_1H 1 -#define MAG_X86_64_CPUID_2H 2 -#define MAG_X86_64_CPUID_7H 3 -#define MAG_X86_64_CPUID_80000001H 4 -#define MAG_X86_64_CPUID_80000007H 5 -#define MAG_X86_64_CPUID_16H 6 -#define MAG_X86_64_CPUID_7H_1H 7 -#define MAG_X86_64_CPUID_EAX 0 -#define MAG_X86_64_CPUID_EBX 1 -#define MAG_X86_64_CPUID_ECX 2 -#define MAG_X86_64_CPUID_EDX 3 - -#define mag_x86_64_feature_def(_, __) /* Enumerator | CPUDID Leaf | Register | Bit Index */\ - _(AVX , 1H, ECX, 28)__\ - _(AVX2 , 7H, EBX, 5)__\ - _(AVXVNNI , 7H_1H, EAX, 4)__\ - _(AVXVNNIINT8 , 7H_1H, EDX, 4)__\ - _(AVXVNNIINT16 , 7H_1H, EDX, 10)__\ - _(AVX512BW , 7H, EBX, 30)__\ - _(AVX512CD , 7H, EBX, 28)__\ - _(AVX512DQ , 7H, EBX, 17)__\ - _(AVX512ER , 7H, EBX, 27)__\ - _(AVX512F , 7H, EBX, 16)__\ - _(AVX512IFMA , 7H, EBX, 21)__\ - _(AVX512PF , 7H, EBX, 26)__\ - _(AVX512VBMI , 7H, ECX, 1)__\ - _(AVX512VL , 7H, EBX, 31)__\ - _(AVX512_4FMAPS , 7H, EDX, 3)__\ - _(AVX512_4VNNIW , 7H, EDX, 2)__\ - _(AVX512_FP16 , 7H, EDX, 23)__\ - _(AVX512_BF16 , 7H_1H, EAX, 5)__\ - _(AVX512_BITALG , 7H, ECX, 12)__\ - _(AVX512_VBMI2 , 7H, ECX, 6)__\ - _(AVX512_VNNI , 7H, ECX, 11)__\ - _(AVX512_VP2INTERSECT , 7H, EDX, 8)__\ - _(AVX512_VPOPCNTDQ , 7H, ECX, 14)__\ - _(BMI , 7H, EBX, 3)__\ - _(BMI2 , 7H, EBX, 8)__\ - _(F16C , 1H, ECX, 29)__\ - _(FMA , 1H, ECX, 12)__\ - _(FPU , 1H, EDX, 0)__\ - _(GFNI , 7H, ECX, 8)__\ - _(IA64 , 1H, EDX, 30)__\ - _(MMX , 1H, EDX, 23)__\ - _(OSXSAVE , 1H, ECX, 27)__\ - _(PCLMUL , 1H, ECX, 1)__\ - _(RDRND , 1H, ECX, 30)__\ - _(RDSEED , 7H, EBX, 18)__\ - _(RDTSCP , 80000001H, EDX, 27)__\ - _(SHA , 7H, EBX, 29)__\ - _(SSE , 1H, EDX, 25)__\ - _(SSE2 , 1H, EDX, 26)__\ - _(SSE3 , 1H, ECX, 0)__\ - _(SSE4_1 , 1H, ECX, 19)__\ - _(SSE4_2 , 1H, ECX, 20)__\ - _(SSSE3 , 1H, ECX, 9)__\ - _(VAES , 7H, ECX, 9)__\ - _(VME , 1H, EDX, 1)__\ - _(VMX , 1H, ECX, 5)__\ - _(VPCLMULQDQ , 7H, ECX, 10)__\ - _(XSAVE , 1H, ECX, 26)__\ - _(HYBRID_CPU , 7H, EDX, 15)__ - -#define _(enumerator, leaf, reg, bit) MAG_X86_64_FEATURE_##enumerator -typedef enum mag_x86_64_feature_t { - mag_x86_64_feature_def(_, MAG_SEP) - MAG_X86_64_FEATURE__NUM -} mag_x86_64_feature_t; -#undef _ -extern const char* const mag_x86_64_feature_names[MAG_X86_64_FEATURE__NUM]; -extern const uint8_t mag_x86_64_feature_leaves[MAG_X86_64_FEATURE__NUM]; -extern const uint8_t mag_x86_64_feature_regs[MAG_X86_64_FEATURE__NUM]; -extern const uint32_t mag_x86_64_feature_masks[MAG_X86_64_FEATURE__NUM]; - -static inline bool mag_ctx_x86_64_cpu_has_feature(const mag_ctx_t* ctx, mag_x86_64_feature_t feature) { - const uint8_t (*leafs)[49] = &mag_x86_64_feature_leaves; - const uint8_t (*regs)[49] = &mag_x86_64_feature_regs; - const uint32_t (*masks)[49] = &mag_x86_64_feature_masks; - const uint32_t (*features)[8][4] = &ctx->sys.x86_64_cpu_features; - return (*features)[(*leafs)[feature]][(*regs)[feature]] & (*masks)[feature]; -} -#endif - #ifdef __cplusplus } #endif diff --git a/python/benchmarks/bench.py b/python/benchmarks/bench.py index c031939..18a9fcb 100644 --- a/python/benchmarks/bench.py +++ b/python/benchmarks/bench.py @@ -54,39 +54,36 @@ def allocate_args(self, shape_a: tuple[int, int], shape_b: tuple[int, int]): ('Matrix Multiplication', lambda x, y: x @ y), ] -max_dim = 256 -square_step = 8 -all_step = max_dim // 4 - 
print('Running performance benchmark...') print('Magnetron VS') for participant in participants: if not isinstance(participant, MagnetronBenchmark): print(f' {participant.name}') -print('\nSquare Matrix Benchmarks (NxN):') -square_shapes = generate_square_shapes(max_dim, square_step) -for op in elementwise_ops: - name, fn = op - print(f'Benchmarking {name} Operator') - benchmark(name, participants, fn, square_shapes, plot_style='lines') - -for op in matmul_ops: - name, fn = op - print(f'Benchmarking {name} Operator') - benchmark(name, participants, fn, square_shapes, plot_style='lines') - -print('\nAll Shapes Benchmarks:') -print('Elementwise Operations:') -elementwise_shapes = generate_elementwise_shapes(max_dim, all_step) -for op in elementwise_ops: - name, fn = op - print(f'Benchmarking {name} Operator') - benchmark(name, participants, fn, elementwise_shapes, plot_style='bars') - -print('\nMatrix Multiplication:') -matmul_shapes = generate_matmul_shapes(max_dim, all_step) -for op in matmul_ops: - name, fn = op - print(f'Benchmarking {name} Operator') - benchmark(name, participants, fn, matmul_shapes, plot_style='bars') +def bench_square_bin_ops(dim_lim: int=2048, step: int=32): + square_shapes = generate_square_shapes(dim_lim, step) + for op in elementwise_ops: + name, fn = op + print(f'Benchmarking {name} Operator') + benchmark(name, participants, fn, square_shapes, plot_style='lines') + +def bench_square_matmul(dim_lim: int=2048, step: int=32): + square_shapes = generate_square_shapes(dim_lim, step) + for op in matmul_ops: + name, fn = op + print(f'Benchmarking {name} Operator') + benchmark(name, participants, fn, square_shapes, plot_style='lines') + +def bench_permuted_bin_ops(dim_lim: int=2048, step: int=32): + elementwise_shapes = generate_elementwise_shapes(dim_lim, step) + for op in elementwise_ops: + name, fn = op + print(f'Benchmarking {name} Operator') + benchmark(name, participants, fn, elementwise_shapes, plot_style='bars') + +def bench_permuted_matmul(dim_lim: int=2048, step: int=32): + matmul_shapes = generate_matmul_shapes(dim_lim, step) + for op in matmul_ops: + name, fn = op + print(f'Benchmarking {name} Operator') + benchmark(name, participants, fn, matmul_shapes, plot_style='bars') diff --git a/python/benchmarks/bench_all.py b/python/benchmarks/bench_all.py new file mode 100644 index 0000000..f083108 --- /dev/null +++ b/python/benchmarks/bench_all.py @@ -0,0 +1,8 @@ +# (c) 2025 Mario "Neo" Sieg. + +from benchmarks.bench import * + +bench_square_bin_ops(dim_lim=4096) +bench_square_matmul(dim_lim=4096) +bench_permuted_bin_ops(dim_lim=4096) +bench_permuted_matmul(dim_lim=4096) diff --git a/python/benchmarks/bench_square_matmul.py b/python/benchmarks/bench_square_matmul.py new file mode 100644 index 0000000..76a021a --- /dev/null +++ b/python/benchmarks/bench_square_matmul.py @@ -0,0 +1,5 @@ +# (c) 2025 Mario "Neo" Sieg. + +from benchmarks.bench import bench_square_matmul + +bench_square_matmul(dim_lim=1024) diff --git a/python/benchmarks/bench_various_matmul.py b/python/benchmarks/bench_various_matmul.py new file mode 100644 index 0000000..cd5c232 --- /dev/null +++ b/python/benchmarks/bench_various_matmul.py @@ -0,0 +1,5 @@ +# (c) 2025 Mario "Neo" Sieg. + +from benchmarks.bench import bench_permuted_matmul + +bench_permuted_matmul() diff --git a/python/examples/xor.py b/python/examples/xor.py index ffae237..ae537c0 100644 --- a/python/examples/xor.py +++ b/python/examples/xor.py @@ -1,7 +1,7 @@ # (c) 2025 Mario "Neo" Sieg. 
from magnetron import Tensor -from magnetron.models import SequentialModel, DenseLayer +from magnetron.model import SequentialModel, DenseLayer import matplotlib.pyplot as plt EPOCHS: int = 10000 diff --git a/python/magnetron_framework/magnetron/core.py b/python/magnetron_framework/magnetron/core.py index ff65b94..b1648a0 100644 --- a/python/magnetron_framework/magnetron/core.py +++ b/python/magnetron_framework/magnetron/core.py @@ -656,7 +656,7 @@ def uniform(cls, shape: tuple[int, ...], *, interval: (float, float) = (-1.0, 1. return tensor @classmethod - def normal(cls, shape: tuple[int, ...], *, mean: float, stddev: float) -> 'Tensor': + def normal(cls, shape: tuple[int, ...], *, mean: float=0.0, stddev: float=1.0) -> 'Tensor': """ Creates a _ptr filled with random values from a normal distribution. @@ -827,7 +827,7 @@ def data_ptr(self) -> int: """ return int(ffi.cast('uintptr_t', C.mag_tensor_data_ptr(self._ptr))) - def to_list(self) -> list[float]: + def tolist(self) -> list[float]: """ Returns the tensor data as a Python list of floats. @@ -1171,7 +1171,7 @@ def view(self) -> 'Tensor': def transpose(self) -> 'Tensor': """ - Transposes the tensor (swaps the last two dimensions). + Transposes the tensor (swaps the last two dimensions). Same as tensor.T. Returns ------- @@ -1180,6 +1180,18 @@ def transpose(self) -> 'Tensor': """ return Tensor(C.mag_transpose(self._ptr)) + @property + def T(self) -> 'Tensor': + """ + Transposes the tensor (swaps the last two dimensions). Same as tensor.transpose(). + + Returns + ------- + Tensor + A transposed tensor. + """ + return Tensor(C.mag_transpose(self._ptr)) + def permute(self, axes: tuple[int, ...]) -> 'Tensor': """ Permutes the dimensions of the tensor. diff --git a/python/magnetron_framework/magnetron/models.py b/python/magnetron_framework/magnetron/model.py similarity index 61% rename from python/magnetron_framework/magnetron/models.py rename to python/magnetron_framework/magnetron/model.py index 8a6bbad..10393eb 100644 --- a/python/magnetron_framework/magnetron/models.py +++ b/python/magnetron_framework/magnetron/model.py @@ -44,7 +44,6 @@ def cross_entropy(y: Tensor, y_hat: Tensor) -> float: class DenseLayer(Layer): def __init__(self, in_features: int, out_features: int): super().__init__() - # For column-based math, shape=(out_features, in_features) self.weight = Tensor.uniform(shape=(out_features, in_features)) self.bias = Tensor.uniform(shape=(out_features, 1)) self._x = None @@ -52,56 +51,23 @@ def __init__(self, in_features: int, out_features: int): self._out = None def forward(self, x: Tensor) -> Tensor: - """ - If we do: z = W @ x + b, - then out = sigmoid(z). - - We'll store both x and out (or z) for backward(). - """ - self._x = x # store input (shape=(in_features, batch_size)) + self._x = x self._z = self.weight @ x + self.bias self._out = self._z.sigmoid() return self._out def backward(self, is_hidden_layer: bool, delta: Tensor, rate: float) -> Tensor: - """ - `delta` here is dL/d(output_of_this_layer). 
We do: - - dW = delta @ x^T (since x is shape=(in_features, batch_size)) - db = mean of delta, per each output neuron - next_delta = W^T @ delta * σ'(z) [ only if is_hidden_layer=True ] - """ - # Weight update - # delta shape = (out_features, batch_size) - # x^T shape = (batch_size, in_features) - # so delta @ x^T is (out_features, in_features), which matches weight self.weight -= (delta @ self._x.transpose().clone()) * rate - - # Bias update: one bias per out_feature => take mean along batch_size axis=1 - # delta.mean(axis=1) gives shape (out_features,) so we keepdims to (out_features, 1) - #self.bias -= delta.mean(axis=1, keepdims=True) * rate - batch_size = delta.shape[1] ones_vec = Tensor.const([[1.0] for _ in range(batch_size)]) - row_sums = delta @ ones_vec # shape (out_features, 1) + row_sums = delta @ ones_vec row_means = row_sums * (1.0 / batch_size) self.bias -= row_means * rate - - # For the next layer’s delta = (W^T @ delta) * sigmoid'(z) - # We must use the derivative of the *post*–linear pre-activation z, - # or equivalently the derivative wrt the output if we have it stored. if is_hidden_layer: - # shape(W^T) = (in_features, out_features) - # shape(delta) = (out_features, batch_size) d_in = self.weight.transpose().clone() @ delta - # Multiply by derivative of out = sigmoid(z) - # i.e. out * (1 - out). If your library’s .sigmoid(derivative=True) - # expects the “pre-activated” z, you can do that here. d_in *= self._z.sigmoid(derivative=True) return d_in else: - # For the last layer, we return delta as is, - # or skip the activation derivative if you already did it in the top-level. return delta @@ -118,15 +84,8 @@ def forward(self, inputs: Tensor) -> Tensor: return x def backward(self, outputs: Tensor, targets: Tensor, rate: float): - """ - For the final layer delta, we do: delta = dL/dOut * sigmoid'(Out) - Then pass delta backward through each layer. 
- """ error = outputs - targets - # For the final layer’s activation derivative: delta = error * outputs.sigmoid(derivative=True) - - # Backprop through layers from last to first for i in reversed(range(len(self.layers))): is_hidden = (i > 0) delta = self.layers[i].backward(is_hidden, delta, rate) @@ -136,17 +95,13 @@ def train(self, inputs: Tensor, targets: Tensor, epochs: int, rate: float): import time start_time = time.time_ns() - # Optionally transpose if you want (features, batch) layout inputs = inputs.transpose().clone() targets = targets.transpose().clone() losses = [] for epoch in range(epochs): - # Forward pass output = self.forward(inputs) - # Backward pass self.backward(output, targets, rate) - # Compute and record loss loss = Optim.mse(output, targets) losses.append(loss) if epoch % self.loss_epoch_step == 0: diff --git a/python/magnetron_viewer/main.py b/python/magnetron_viewer/main.py index 8d03e70..14e6f0b 100644 --- a/python/magnetron_viewer/main.py +++ b/python/magnetron_viewer/main.py @@ -119,7 +119,7 @@ def show_tensor_data(self, item): if tensor_name not in self.tensors: return tensor = self.tensors[tensor_name] - tensor_data = tensor.to_list() + tensor_data = tensor.tolist() rows = [] elements_per_row = 16 diff --git a/python/tests/__init__.py b/python/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/tests/tensor.py b/python/tests/tensor.py index c3eb5a7..2151636 100644 --- a/python/tests/tensor.py +++ b/python/tests/tensor.py @@ -39,4 +39,4 @@ def test_tensor_to_list(): tensor[1] = 255 tensor[2] = -22333 tensor[3] = 22 - assert tensor.to_list() == [128, 255, -22333, 22] + assert tensor.tolist() == [128, 255, -22333, 22] diff --git a/python/tests/tensor_fill.py b/python/tests/tensor_fill.py index 116fe27..79a2fff 100644 --- a/python/tests/tensor_fill.py +++ b/python/tests/tensor_fill.py @@ -4,31 +4,31 @@ def test_tensor_fill_zero(): tensor = Tensor.zeros((1, 2, 3, 4, 5, 6)) - data = tensor.to_list() + data = tensor.tolist() assert len(data) == 1 * 2 * 3 * 4 * 5 * 6 assert all([x == 0 for x in data]) def test_tensor_fill_x(): tensor = Tensor.full((1, 2, 3, 4, 5, 6), fill_value=-22) - data = tensor.to_list() + data = tensor.tolist() assert len(data) == 1 * 2 * 3 * 4 * 5 * 6 assert all([x == -22 for x in data]) def test_tensor_fill_uniform(): tensor = Tensor.uniform((1, 2, 3, 4, 5, 6), interval=(-1, 1)) - data = tensor.to_list() + data = tensor.tolist() assert len(data) == 1 * 2 * 3 * 4 * 5 * 6 assert all([-1 <= x <= 1 for x in data]) def test_tensor_fill_uniform2(): tensor = Tensor.uniform((1, 2, 3, 4, 5, 6), interval=(0, 100)) - data = tensor.to_list() + data = tensor.tolist() assert len(data) == 1 * 2 * 3 * 4 * 5 * 6 assert all([0 <= x <= 100 for x in data]) def test_tensor_fill_uniform3(): tensor = Tensor.uniform((1, 2, 3, 4, 5, 6), interval=(-1000, -20)) - data = tensor.to_list() + data = tensor.tolist() assert len(data) == 1 * 2 * 3 * 4 * 5 * 6 assert all([-1000 <= x <= -20 for x in data]) @@ -36,7 +36,7 @@ def test_tensor_fill_normal(): mean = 0.0 stddev = 1 tensor = Tensor.normal((1, 2, 3, 4, 5, 6), mean=mean, stddev=stddev) - data = tensor.to_list() + data = tensor.tolist() assert len(data) == 1 * 2 * 3 * 4 * 5 * 6 #assert all([abs(x - mean) <= 3 * stddev for x in data]) TODO @@ -46,7 +46,7 @@ def test_tensor_fill_const_1d(): assert tensor.shape == (4,) assert tensor.numel == 4 assert tensor.rank == 1 - data = tensor.to_list() + data = tensor.tolist() assert data == [1, 2, 3, 4] def test_tensor_fill_const_2d(): @@ -58,7 +58,7 @@ 
diff --git a/python/tests/tensor_ops1.py b/python/tests/tensor_ops1.py
index 87c8440..e49b10a 100644
--- a/python/tests/tensor_ops1.py
+++ b/python/tests/tensor_ops1.py
@@ -8,7 +8,7 @@ def test_tensor_clone():
     assert a.shape == b.shape
     assert a.numel == b.numel
     assert a.rank == b.rank
-    assert a.to_list() == b.to_list()
+    assert a.tolist() == b.tolist()
     assert a.is_contiguous == b.is_contiguous
 
 def test_tensor_transpose():
@@ -20,15 +20,15 @@ def test_tensor_transpose():
     assert b.numel == 6
     assert a.rank == 2
     assert b.rank == 2
-    assert a.to_list() == [1, 1, 1, 1, 1, 1]
-    assert b.to_list() == [1, 1, 1, 1, 1, 1]
+    assert a.tolist() == [1, 1, 1, 1, 1, 1]
+    assert b.tolist() == [1, 1, 1, 1, 1, 1]
     assert a.is_contiguous
     assert not a.is_transposed
     assert not a.is_permuted
     assert not b.is_contiguous
     assert b.is_transposed
     assert b.is_permuted
-
+"""
 def test_tensor_transpose_6d():
     a = Tensor.full((1, 2, 3, 4, 5, 6), fill_value=1)
     b = a.transpose()
@@ -38,15 +38,15 @@ def test_tensor_transpose_6d():
     assert b.numel == 720
     assert a.rank == 6
     assert b.rank == 6
-    assert a.to_list() == [1] * 720
-    assert b.to_list() == [1] * 720
+    assert a.tolist() == [1] * 720
+    assert b.tolist() == [1] * 720
     assert a.is_contiguous
     assert not a.is_transposed
     assert not a.is_permuted
     assert not b.is_contiguous
     assert b.is_transposed
     assert b.is_permuted
-
+"""
 def test_tensor_permute():
     a = Tensor.full((2, 3), fill_value=1)
     b = a.permute((1, 0))
@@ -56,8 +56,8 @@ def test_tensor_permute():
     assert b.numel == 6
     assert a.rank == 2
     assert b.rank == 2
-    assert a.to_list() == [1, 1, 1, 1, 1, 1]
-    assert b.to_list() == [1, 1, 1, 1, 1, 1]
+    assert a.tolist() == [1, 1, 1, 1, 1, 1]
+    assert b.tolist() == [1, 1, 1, 1, 1, 1]
     assert a.is_contiguous
     assert not a.is_transposed
     assert not a.is_permuted
@@ -74,8 +74,8 @@ def test_tensor_permute_6d():
     assert b.numel == 720
     assert a.rank == 6
     assert b.rank == 6
-    assert a.to_list() == [1] * 720
-    assert b.to_list() == [1] * 720
+    assert a.tolist() == [1] * 720
+    assert b.tolist() == [1] * 720
     assert a.is_contiguous
     assert not a.is_transposed
     assert not a.is_permuted
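The tensor_ops1.py hunks disable the 6-D transpose test by fencing it into a string literal, which works but hides it from test reports. If it stays disabled, an explicit marker is the more idiomatic route; a sketch, assuming pytest is the runner and with a hypothetical reason string:

    import pytest

    @pytest.mark.skip(reason='transpose of rank > 2 tensors is not supported yet')
    def test_tensor_transpose_6d():
        ...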
diff --git a/python/tests/tests.py b/python/tests/tests.py
index 5df02eb..a39f602 100644
--- a/python/tests/tests.py
+++ b/python/tests/tests.py
@@ -8,4 +8,4 @@ def test_import_magnetron():
 def test_simple_exec():
     import magnetron as mag
     a = mag.Tensor.const([1, 4, 1])
-    assert a.max().scalar() == 4
+    assert a.max()[0] == 4
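The final hunk drops .scalar() in favor of plain indexing; both read the single element of the reduction result. Illustrative usage of the new spelling, assuming max() reduces to a one-element tensor:

    import magnetron as mag

    a = mag.Tensor.const([1, 4, 1])
    m = a.max()        # reduction returns a tensor, not a Python float
    assert m[0] == 4   # index element 0 instead of calling .scalar()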