Example refactor #8

Merged · 24 commits · Feb 11, 2025
9 changes: 9 additions & 0 deletions .gitmodules
@@ -0,0 +1,9 @@
[submodule "extern/googletest"]
path = extern/googletest
url = https://github.com/google/googletest
[submodule "extern/nanobench"]
path = extern/nanobench
url = https://github.com/martinus/nanobench
[submodule "extern/mimalloc"]
path = extern/mimalloc
url = https://github.com/microsoft/mimalloc
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -19,12 +19,14 @@ option(MAGNETRON_DEBUG "Enable debug mode" OFF) # En
option(MAGNETRON_CPU_APPROX_MATH "Trade precision for performance" ON) # (CPU only) Enable SIMD math function approximations. Greatly increases performance. Try disabling if you encounter numerical instability. Does NOT enable -ffast-math or similar compiler flags.
option(MAGNETRON_ENABLE_CUDA "Enable CUDA support" ON) # Enable CUDA support
option(MAGNETRON_ENABLE_ACCELERATE "Use Apple's Accelerate framework" ON) # Use Apple's Accelerate framework for optimized math functions (only on Apple platforms)
+ option(MAGNETRON_ENABLE_MIMALLOC "Use mimalloc as memory allocator" ON) # Use mimalloc as memory allocator for faster memory allocation

set(MAGNETRON_CUDA_COMPILER "/usr/local/cuda-12.6/bin/nvcc" CACHE STRING "Path to the CUDA compiler") # Set to your CUDA compiler path
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

if (${MAGNETRON_BUILD_TESTS})
    enable_testing()
+     add_subdirectory(extern/googletest)
endif()

if (${MAGNETRON_BUILD_TESTS} OR ${MAGNETRON_BUILD_BENCHMARKS} OR ${MAGNETRON_BUILD_FUZZERS})
@@ -45,6 +47,10 @@ if (${MAGNETRON_ENABLE_ACCELERATE} AND APPLE)
    include(cmake/accelerate.cmake)
endif()

+ if (${MAGNETRON_ENABLE_MIMALLOC})
+     include(cmake/allocator.cmake)
+ endif()
+
if (${MAGNETRON_BUILD_TESTS})
    add_subdirectory(test)
endif()
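The new MAGNETRON_ENABLE_MIMALLOC branch includes cmake/allocator.cmake, a file that is not part of this diff. A minimal sketch of what it could contain, assuming mimalloc's stock CMake options and targets (MI_BUILD_TESTS, mimalloc-static) and assuming the magnetron library target already exists at the include point:

# Hypothetical sketch of cmake/allocator.cmake -- not included in this PR.
set(MI_BUILD_TESTS OFF CACHE BOOL "" FORCE)             # skip mimalloc's own test binaries
add_subdirectory(${CMAKE_SOURCE_DIR}/extern/mimalloc)   # submodule registered in .gitmodules above
target_link_libraries(magnetron PRIVATE mimalloc-static) # link the static mimalloc target into the core library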
2 changes: 1 addition & 1 deletion benchmark/CMakeLists.txt
@@ -7,7 +7,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_executable(magnetron_benchmark benchmarks.cpp)
target_link_libraries(magnetron_benchmark magnetron)
target_include_directories(magnetron_benchmark PRIVATE ../magnetron)
- target_include_directories(magnetron_benchmark PRIVATE nanobench)
+ target_include_directories(magnetron_benchmark PRIVATE ../extern/nanobench/src/include)

add_executable(magnetron_profile profile.cpp)
target_link_libraries(magnetron_profile magnetron)
77 changes: 31 additions & 46 deletions benchmark/benchmarks.cpp
@@ -5,59 +5,44 @@
#include <magnetron.h>
#define ANKERL_NANOBENCH_IMPLEMENT
#include <nanobench.h>
- #include <thread>

#include "magnetron_internal.h"

- static auto bench_cpu_compute(std::int64_t numel_per_dim) -> void {
+ static auto bench_op(ankerl::nanobench::Bench& bench, std::int64_t numel_per_dim) -> void {
+     mag_device_descriptor_t desc {};
+     desc.type = MAG_COMPUTE_DEVICE_TYPE_CPU;
+     mag_ctx_t* ctx = mag_ctx_create2(&desc);
+     mag_tensor_t* A = mag_tensor_create_2d(ctx, MAG_DTYPE_F32, numel_per_dim, numel_per_dim);
+     mag_tensor_fill(A, 1.0f);
+     mag_tensor_t* B = mag_tensor_create_2d(ctx, MAG_DTYPE_F32, numel_per_dim, numel_per_dim);
+     mag_tensor_fill(B, 3.0f);
+     bench.run("Parallel Elems = " + std::to_string(A->numel), [&] {
+         mag_tensor_t* R = mag_add(A, B);
+         ankerl::nanobench::doNotOptimizeAway(R);
+         mag_tensor_decref(R);
+     });
+
+     ankerl::nanobench::doNotOptimizeAway(ctx);
+     mag_tensor_decref(B);
+     mag_tensor_decref(A);
+     mag_ctx_destroy(ctx);
+ }
+
+ auto main() -> int {
    ankerl::nanobench::Bench bench {};
-     bench.title("Parallel MM Big Tensor | Numel per Dim: " + std::to_string(numel_per_dim))
+     bench.title("Parallel Big Tensor")
        .unit("MM")
        .warmup(100)
        .relative(true)
        .performanceCounters(true);

-     std::cout << "Benchmarking Parallel MM on CPU with Numel per Dim: " << numel_per_dim << std::endl;
-
-     auto exec_bench = [&](std::uint32_t threads) {
-         mag_device_descriptor_t desc {};
-         desc.type = MAG_COMPUTE_DEVICE_TYPE_CPU;
-         desc.thread_count = threads;
-         mag_ctx_t* ctx = mag_ctx_create2(&desc);
-         mag_tensor_t* A = mag_tensor_create_2d(ctx, MAG_DTYPE_F32, numel_per_dim, numel_per_dim);
-         mag_tensor_fill_random_normal(A, 0.0f, 1.0f);
-         mag_tensor_t* B = mag_tensor_create_2d(ctx, MAG_DTYPE_F32, numel_per_dim, numel_per_dim);
-         mag_tensor_fill_random_normal(B, 0.0f, 1.0f);
-         bench.run("Parallel MM on " + std::to_string(threads) + " threads, Elems = " + std::to_string(A->numel), [&] {
-             mag_tensor_t* R = mag_matmul(A, B);
-             ankerl::nanobench::doNotOptimizeAway(R);
-             mag_tensor_decref(R);
-         });
-
-         ankerl::nanobench::doNotOptimizeAway(ctx);
-         mag_tensor_decref(B);
-         mag_tensor_decref(A);
-         mag_ctx_destroy(ctx);
-     };
-
-     std::uint32_t num_threads = std::max(1u, std::thread::hardware_concurrency());
-
-     for (std::uint32_t i=1; i <= num_threads;) {
-         exec_bench(i);
-         if (i == 1) ++i;
-         else i += 2;
-     }
- }
-
- auto main() -> int {
-     //bench_cpu_compute(10000);
-     bench_cpu_compute(1000);
-     bench_cpu_compute(750);
-     bench_cpu_compute(500);
-     bench_cpu_compute(250);
-     bench_cpu_compute(100);
-     bench_cpu_compute(10);
-     bench_cpu_compute(2);
-     //bench_cpu_compute(250);
+     bench_op(bench, 15000);
+     bench_op(bench, 10000);
+     bench_op(bench, 1000);
+     bench_op(bench, 750);
+     bench_op(bench, 500);
+     bench_op(bench, 250);
+     bench_op(bench, 100);
+     bench_op(bench, 10);
+     bench_op(bench, 4);
    return 0;
}
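The refactor above trades the old per-thread-count matmul sweep for a single shared Bench driving an element-wise mag_add workload. If the matmul case is still wanted, it can be layered onto the new pattern. The sketch below is hypothetical and not part of this PR: only the name bench_matmul is new, and every magnetron and nanobench call is taken from code that appears elsewhere in this diff.

// Hypothetical follow-up sketch (not in this PR): a matmul variant of bench_op,
// restoring the thread-count dimension that bench_cpu_compute used to cover.
static auto bench_matmul(ankerl::nanobench::Bench& bench, std::int64_t numel_per_dim, std::uint32_t threads) -> void {
    mag_device_descriptor_t desc {};
    desc.type = MAG_COMPUTE_DEVICE_TYPE_CPU;
    desc.thread_count = threads;  // pin the CPU worker count, as the removed benchmark did
    mag_ctx_t* ctx = mag_ctx_create2(&desc);
    mag_tensor_t* A = mag_tensor_create_2d(ctx, MAG_DTYPE_F32, numel_per_dim, numel_per_dim);
    mag_tensor_fill_random_normal(A, 0.0f, 1.0f);
    mag_tensor_t* B = mag_tensor_create_2d(ctx, MAG_DTYPE_F32, numel_per_dim, numel_per_dim);
    mag_tensor_fill_random_normal(B, 0.0f, 1.0f);
    bench.run("MM on " + std::to_string(threads) + " threads, Elems = " + std::to_string(A->numel), [&] {
        mag_tensor_t* R = mag_matmul(A, B);       // benchmarked operation
        ankerl::nanobench::doNotOptimizeAway(R);  // keep the result alive past the optimizer
        mag_tensor_decref(R);
    });
    mag_tensor_decref(B);
    mag_tensor_decref(A);
    mag_ctx_destroy(ctx);
}

// Possible call site next to the bench_op calls in main():
//     bench_matmul(bench, 1000, 8);  // thread count chosen arbitrarily for illustration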