Skip to content

Commit

Permalink
Merge pull request #8 from MarioSieg/example-refractor
Browse files Browse the repository at this point in the history
Example refactor
  • Loading branch information
MarioSieg authored Feb 11, 2025
2 parents a716373 + 608a925 commit 28060b1
Show file tree
Hide file tree
Showing 302 changed files with 2,588 additions and 117,033 deletions.
9 changes: 9 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[submodule "extern/googletest"]
path = extern/googletest
url = https://github.com/google/googletest
[submodule "extern/nanobench"]
path = extern/nanobench
url = https://github.com/martinus/nanobench
[submodule "extern/mimalloc"]
path = extern/mimalloc
url = https://github.com/microsoft/mimalloc
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ option(MAGNETRON_DEBUG "Enable debug mode" OFF) # En
option(MAGNETRON_CPU_APPROX_MATH "Trade precision for performance" ON) # (CPU only) Enable SIMD math function approximations. Greatly increases performance. Try disabling if you encounter numerical instability. Does NOT enable -ffast-math or similar compiler flags.
option(MAGNETRON_ENABLE_CUDA "Enable CUDA support" ON) # Enable CUDA support
option(MAGNETRON_ENABLE_ACCELERATE "Use Apple's Accelerate framework" ON) # Use Apple's Accelerate framework for optimized math functions (only on Apple platforms)
option(MAGNETRON_ENABLE_MIMALLOC "Use mimalloc as memory allocator" ON) # Use mimalloc as memory allocator for faster memory allocation

set(MAGNETRON_CUDA_COMPILER "/usr/local/cuda-12.6/bin/nvcc" CACHE STRING "Path to the CUDA compiler") # Set to your CUDA compiler path
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

if (${MAGNETRON_BUILD_TESTS})
enable_testing()
add_subdirectory(extern/googletest)
endif()

if (${MAGNETRON_BUILD_TESTS} OR ${MAGNETRON_BUILD_BENCHMARKS} OR ${MAGNETRON_BUILD_FUZZERS})
Expand All @@ -45,6 +47,10 @@ if (${MAGNETRON_ENABLE_ACCELERATE} AND APPLE)
include(cmake/accelerate.cmake)
endif()

if (${MAGNETRON_ENABLE_MIMALLOC})
include(cmake/allocator.cmake)
endif()

if (${MAGNETRON_BUILD_TESTS})
add_subdirectory(test)
endif()
Expand Down
2 changes: 1 addition & 1 deletion benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_executable(magnetron_benchmark benchmarks.cpp)
target_link_libraries(magnetron_benchmark magnetron)
target_include_directories(magnetron_benchmark PRIVATE ../magnetron)
target_include_directories(magnetron_benchmark PRIVATE nanobench)
target_include_directories(magnetron_benchmark PRIVATE ../extern/nanobench/src/include)

add_executable(magnetron_profile profile.cpp)
target_link_libraries(magnetron_profile magnetron)
Expand Down
77 changes: 31 additions & 46 deletions benchmark/benchmarks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,59 +5,44 @@
#include <magnetron.h>
#define ANKERL_NANOBENCH_IMPLEMENT
#include <nanobench.h>
#include <thread>

#include "magnetron_internal.h"

static auto bench_cpu_compute(std::int64_t numel_per_dim) -> void {
static auto bench_op(ankerl::nanobench::Bench& bench, std::int64_t numel_per_dim) -> void {
mag_device_descriptor_t desc {};
desc.type = MAG_COMPUTE_DEVICE_TYPE_CPU;
mag_ctx_t* ctx = mag_ctx_create2(&desc);
mag_tensor_t* A = mag_tensor_create_2d(ctx, MAG_DTYPE_F32, numel_per_dim, numel_per_dim);
mag_tensor_fill(A, 1.0f);
mag_tensor_t* B = mag_tensor_create_2d(ctx, MAG_DTYPE_F32, numel_per_dim, numel_per_dim);
mag_tensor_fill(A, 3.0f);
bench.run("Parallel Elems = " + std::to_string(A->numel), [&] {
mag_tensor_t* R = mag_add(A, B);
ankerl::nanobench::doNotOptimizeAway(R);
mag_tensor_decref(R);
});

ankerl::nanobench::doNotOptimizeAway(ctx);
mag_tensor_decref(B);
mag_tensor_decref(A);
mag_ctx_destroy(ctx);
}

auto main() -> int {
    /* One shared bench instance so every tensor size lands in the same
       relative-comparison table. */
    ankerl::nanobench::Bench bench {};
    bench.title("Parallel Big Tensor")
        .unit("MM")
        .warmup(100)               /* Warm caches/threads before measuring. */
        .relative(true)            /* Report each run relative to the first. */
        .performanceCounters(true);

    /* Sweep from large to tiny tensors to expose the parallelization
       break-even point of the CPU backend. */
    bench_op(bench, 15000);
    bench_op(bench, 10000);
    bench_op(bench, 1000);
    bench_op(bench, 750);
    bench_op(bench, 500);
    bench_op(bench, 250);
    bench_op(bench, 100);
    bench_op(bench, 10);
    bench_op(bench, 4);
    return 0;
}
Loading

0 comments on commit 28060b1

Please sign in to comment.