Skip to content

Commit 13ad16a

Browse files
Add support for properly optimized Windows ARM64 builds with LLVM and MSVC (ggml-org#7191)
* logging: add proper checks for clang to avoid errors and warnings with VA_ARGS * build: add CMake Presets and toolchian files for Windows ARM64 * matmul-int8: enable matmul-int8 with MSVC and fix Clang warnings * ci: add support for optimized Windows ARM64 builds with MSVC and LLVM * matmul-int8: fixed typos in q8_0_q8_0 matmuls Co-authored-by: Georgi Gerganov <[email protected]> * matmul-int8: remove unnecessary casts in q8_0_q8_0 --------- Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 8f7080b commit 13ad16a

File tree

7 files changed

+138
-58
lines changed

7 files changed

+138
-58
lines changed

.github/workflows/build.yml

+33-28
Original file line numberDiff line numberDiff line change
@@ -693,26 +693,28 @@ jobs:
693693
strategy:
694694
matrix:
695695
include:
696-
- build: 'rpc'
696+
- build: 'rpc-x64'
697697
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
698-
- build: 'noavx'
698+
- build: 'noavx-x64'
699699
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
700-
- build: 'avx2'
700+
- build: 'avx2-x64'
701701
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
702-
- build: 'avx'
702+
- build: 'avx-x64'
703703
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
704-
- build: 'avx512'
704+
- build: 'avx512-x64'
705705
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
706-
- build: 'clblast'
706+
- build: 'clblast-x64'
707707
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
708-
- build: 'openblas'
708+
- build: 'openblas-x64'
709709
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
710-
- build: 'kompute'
710+
- build: 'kompute-x64'
711711
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
712-
- build: 'vulkan'
712+
- build: 'vulkan-x64'
713713
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
714-
- build: 'arm64'
715-
defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
714+
- build: 'llvm-arm64'
715+
defines: '-G Ninja -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
716+
- build: 'msvc-arm64'
717+
defines: '-G Ninja -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
716718

717719
steps:
718720
- name: Clone
@@ -723,21 +725,21 @@ jobs:
723725

724726
- name: Clone Kompute submodule
725727
id: clone_kompute
726-
if: ${{ matrix.build == 'kompute' }}
728+
if: ${{ matrix.build == 'kompute-x64' }}
727729
run: |
728730
git submodule update --init kompute
729731
730732
- name: Download OpenCL SDK
731733
id: get_opencl
732-
if: ${{ matrix.build == 'clblast' }}
734+
if: ${{ matrix.build == 'clblast-x64' }}
733735
run: |
734736
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
735737
mkdir $env:RUNNER_TEMP/opencl
736738
tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
737739
738740
- name: Download CLBlast
739741
id: get_clblast
740-
if: ${{ matrix.build == 'clblast' }}
742+
if: ${{ matrix.build == 'clblast-x64' }}
741743
run: |
742744
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
743745
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
@@ -750,7 +752,7 @@ jobs:
750752
751753
- name: Download OpenBLAS
752754
id: get_openblas
753-
if: ${{ matrix.build == 'openblas' }}
755+
if: ${{ matrix.build == 'openblas-x64' }}
754756
run: |
755757
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
756758
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
@@ -763,38 +765,41 @@ jobs:
763765
764766
- name: Install Vulkan SDK
765767
id: get_vulkan
766-
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
768+
if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
767769
run: |
768770
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
769771
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
770772
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
771773
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
772774
775+
- name: Install Ninja
776+
id: install_ninja
777+
run: |
778+
choco install ninja
779+
773780
- name: Build
774781
id: cmake_build
775782
run: |
776-
mkdir build
777-
cd build
778-
cmake .. ${{ matrix.defines }}
779-
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
783+
cmake -S . -B build ${{ matrix.defines }}
784+
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
780785
781786
- name: Add clblast.dll
782787
id: add_clblast_dll
783-
if: ${{ matrix.build == 'clblast' }}
788+
if: ${{ matrix.build == 'clblast-x64' }}
784789
run: |
785790
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
786791
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
787792
788793
- name: Add libopenblas.dll
789794
id: add_libopenblas_dll
790-
if: ${{ matrix.build == 'openblas' }}
795+
if: ${{ matrix.build == 'openblas-x64' }}
791796
run: |
792797
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
793798
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
794799
795800
- name: Check AVX512F support
796801
id: check_avx512f
797-
if: ${{ matrix.build == 'avx512' }}
802+
if: ${{ matrix.build == 'avx512-x64' }}
798803
continue-on-error: true
799804
run: |
800805
cd build
@@ -808,14 +813,14 @@ jobs:
808813
- name: Test
809814
id: cmake_test
810815
# not all machines have native AVX-512
811-
if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
816+
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
812817
run: |
813818
cd build
814819
ctest -L main -C Release --verbose --timeout 900
815820
816821
- name: Test (Intel SDE)
817822
id: cmake_test_sde
818-
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
823+
if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
819824
run: |
820825
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
821826
# for some weird reason windows tar doesn't like sde tar.xz
@@ -843,14 +848,14 @@ jobs:
843848
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
844849
run: |
845850
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
846-
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
851+
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
847852
848853
- name: Upload artifacts
849854
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
850855
uses: actions/upload-artifact@v4
851856
with:
852-
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
853-
name: llama-bin-win-${{ matrix.build }}-x64.zip
857+
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
858+
name: llama-bin-win-${{ matrix.build }}.zip
854859

855860
windows-latest-cmake-cuda:
856861
runs-on: windows-latest

CMakeLists.txt

+5
Original file line numberDiff line numberDiff line change
@@ -1007,6 +1007,11 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
10071007
if (GGML_COMPILER_SUPPORT_DOTPROD)
10081008
add_compile_definitions(__ARM_FEATURE_DOTPROD)
10091009
endif ()
1010+
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
1011+
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
1012+
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
1013+
endif ()
1014+
10101015
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
10111016
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
10121017
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)

CMakePresets.json

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
{
2+
"version": 4,
3+
"configurePresets": [
4+
{
5+
"name": "base",
6+
"hidden": true,
7+
"generator": "Ninja",
8+
"binaryDir": "${sourceDir}/build-${presetName}",
9+
"cacheVariables": {
10+
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
11+
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
12+
}
13+
},
14+
15+
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
16+
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
17+
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
18+
19+
{
20+
"name": "arm64-windows-msvc", "hidden": true,
21+
"architecture": { "value": "arm64", "strategy": "external" },
22+
"toolset": { "value": "host=x86_64", "strategy": "external" },
23+
"cacheVariables": {
24+
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
25+
}
26+
},
27+
28+
{
29+
"name": "arm64-windows-llvm", "hidden": true,
30+
"architecture": { "value": "arm64", "strategy": "external" },
31+
"toolset": { "value": "host=x86_64", "strategy": "external" },
32+
"cacheVariables": {
33+
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
34+
}
35+
},
36+
37+
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
38+
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
39+
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
40+
41+
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
42+
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
43+
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
44+
]
45+
}

cmake/arm64-windows-llvm.cmake

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
set( CMAKE_SYSTEM_NAME Windows )
2+
set( CMAKE_SYSTEM_PROCESSOR arm64 )
3+
4+
set( target arm64-pc-windows-msvc )
5+
6+
set( CMAKE_C_COMPILER clang )
7+
set( CMAKE_CXX_COMPILER clang++ )
8+
9+
set( CMAKE_C_COMPILER_TARGET ${target} )
10+
set( CMAKE_CXX_COMPILER_TARGET ${target} )
11+
12+
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
13+
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
14+
15+
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
16+
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

cmake/arm64-windows-msvc.cmake

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
set( CMAKE_SYSTEM_NAME Windows )
2+
set( CMAKE_SYSTEM_PROCESSOR arm64 )
3+
4+
set( target arm64-pc-windows-msvc )
5+
set( CMAKE_C_COMPILER_TARGET ${target} )
6+
set( CMAKE_CXX_COMPILER_TARGET ${target} )

common/log.h

+5-5
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
211211
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
212212
#else
213213
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
214-
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
214+
#define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
215215
#endif
216216
#else
217217
#define LOG_FLF_FMT "%s"
@@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
224224
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
225225
#else
226226
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
227-
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
227+
#define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
228228
#endif
229229
#else
230230
#define LOG_TEE_FLF_FMT "%s"
@@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
294294
// Main LOG macro.
295295
// behaves like printf, and supports arguments the exact same way.
296296
//
297-
#ifndef _MSC_VER
297+
#if !defined(_MSC_VER) || defined(__clang__)
298298
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
299299
#else
300300
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
308308
// Secondary target can be changed just like LOG_TARGET
309309
// by defining LOG_TEE_TARGET
310310
//
311-
#ifndef _MSC_VER
311+
#if !defined(_MSC_VER) || defined(__clang__)
312312
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
313313
#else
314314
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
315315
#endif
316316

317317
// LOG macro variants with auto endline.
318-
#ifndef _MSC_VER
318+
#if !defined(_MSC_VER) || defined(__clang__)
319319
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
320320
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
321321
#else

0 commit comments

Comments
 (0)