
Commit 70c9355

[CI][Benchmarks] update llama.cpp and requirements to latest
This patch updates llama.cpp to the latest available version, uses a new, more relevant GGUF model, and updates oneAPI to 2025.1. I was trying to avoid updating oneAPI, but the latest llama.cpp internal pooling logic seems to be broken on 2025.0, resulting in double-free errors when using older oneAPI components. The utils.download function also had to be updated because it was using a deprecated feature and didn't work on some configurations.
1 parent 0fffee9 commit 70c9355

3 files changed (+30 −24 lines)

Diff for: devops/scripts/benchmarks/benches/llamacpp.py (+22 −12)

```diff
@@ -29,7 +29,7 @@ def git_url(self) -> str:
         return "https://github.com/ggerganov/llama.cpp"
 
     def git_hash(self) -> str:
-        return "1ee9eea094fe5846c7d8d770aa7caa749d246b23"
+        return "916c83bfe7f8b08ada609c3b8e583cf5301e594b"
 
     def setup(self):
         if options.sycl is None:
@@ -47,9 +47,9 @@ def setup(self):
 
         self.model = download(
             self.models_dir,
-            "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf",
-            "Phi-3-mini-4k-instruct-q4.gguf",
-            checksum="fc4f45c9729874a33a527465b2ec78189a18e5726b7121182623feeae38632ace4f280617b01d4a04875acf49d263ee4",
+            "https://huggingface.co/ggml-org/DeepSeek-R1-Distill-Qwen-1.5B-Q4_0-GGUF/resolve/main/deepseek-r1-distill-qwen-1.5b-q4_0.gguf",
+            "deepseek-r1-distill-qwen-1.5b-q4_0.gguf",
+            checksum="791f6091059b653a24924b9f2b9c3141c8f892ae13fff15725f77a2bf7f9b1b6b71c85718f1e9c0f26c2549aba44d191",
         )
 
         self.oneapi = get_oneapi()
@@ -64,10 +64,11 @@ def setup(self):
             f"-DGGML_SYCL=ON",
             f"-DCMAKE_C_COMPILER=clang",
             f"-DCMAKE_CXX_COMPILER=clang++",
-            f"-DDNNL_DIR={self.oneapi.dnn_cmake()}",
+            f"-DDNNL_GPU_VENDOR=INTEL",
             f"-DTBB_DIR={self.oneapi.tbb_cmake()}",
-            f'-DCMAKE_CXX_FLAGS=-I"{self.oneapi.mkl_include()}"',
-            f"-DCMAKE_SHARED_LINKER_FLAGS=-L{self.oneapi.compiler_lib()} -L{self.oneapi.mkl_lib()}",
+            f"-DDNNL_DIR={self.oneapi.dnn_cmake()}",
+            f"-DSYCL_COMPILER=ON",
+            f"-DMKL_DIR={self.oneapi.mkl_cmake()}",
         ]
 
         run(configure_command, add_sycl=True)
@@ -96,14 +97,17 @@ def __init__(self, bench):
     def setup(self):
         self.benchmark_bin = os.path.join(self.bench.build_path, "bin", "llama-bench")
 
+    def model(self):
+        return "DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf"
+
     def name(self):
-        return f"llama.cpp"
+        return f"llama.cpp {self.model()}"
 
     def description(self) -> str:
         return (
             "Performance testing tool for llama.cpp that measures LLM inference speed in tokens per second. "
             "Runs both prompt processing (initial context processing) and text generation benchmarks with "
-            "different batch sizes. Higher values indicate better performance. Uses the Phi-3-mini-4k-instruct "
+            f"different batch sizes. Higher values indicate better performance. Uses the {self.model()} "
             "quantized model and leverages SYCL with oneDNN for acceleration."
         )
 
@@ -122,12 +126,18 @@ def run(self, env_vars) -> list[Result]:
             "128",
             "-p",
             "512",
-            "-b",
-            "128,256,512",
+            "-pg",
+            "0,0",
+            "-sm",
+            "none",
+            "-ngl",
+            "99",
             "--numa",
             "isolate",
             "-t",
-            "56", # TODO: use only as many threads as numa node 0 has cpus
+            "8",
+            "--mmap",
+            "0",
             "--model",
             f"{self.bench.model}",
         ]
```
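
For context, here is an annotated reconstruction of the llama-bench invocation that the updated run() assembles. The binary path and model filename are illustrative placeholders, and the flag descriptions are paraphrased from llama-bench's help text, so treat this as a sketch rather than authoritative documentation:

```python
# Illustrative reconstruction of the new llama-bench command line;
# paths and the model filename are placeholders, not the script's values.
command = [
    "./build/bin/llama-bench",
    "-n", "128",          # tokens generated per text-generation test
    "-p", "512",          # prompt length for the prompt-processing test
    "-pg", "0,0",         # skip the combined prompt+generation test
    "-sm", "none",        # split-mode off: keep the model on a single device
    "-ngl", "99",         # offload up to 99 layers, i.e. the whole model, to the GPU
    "--numa", "isolate",  # restrict execution to the current NUMA node
    "-t", "8",            # fixed CPU thread count instead of the old hardcoded 56
    "--mmap", "0",        # read the model into memory instead of mmap-ing it
    "--model", "deepseek-r1-distill-qwen-1.5b-q4_0.gguf",
]
```

Dropping the old `-b 128,256,512` sweep leaves the batch size at the llama-bench default, so each run now measures a single configuration per test type.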

Diff for: devops/scripts/benchmarks/utils/oneapi.py (+3 −9)

```diff
@@ -16,16 +16,10 @@ def __init__(self):
         Path(self.oneapi_dir).mkdir(parents=True, exist_ok=True)
         self.oneapi_instance_id = self.generate_unique_oneapi_id(self.oneapi_dir)
 
-        # can we just hardcode these links?
         self.install_package(
-            "dnnl",
-            "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/87e117ab-039b-437d-9c80-dcd5c9e675d5/intel-onednn-2025.0.0.862_offline.sh",
-            "6866feb5b8dfefd6ff45d6bfabed44f01d7fba8fd452480ae1fd86b92e9481ae052c24842da14f112f672f5c4859945b",
-        )
-        self.install_package(
-            "mkl",
-            "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh",
-            "122bb84cf943ea27753cb399c81ab2ae218ebd51b789c74d273240157722925ab4d5a43cb0b5de41b854f2c5a59a4002",
+            "base",
+            "https://registrationcenter-download.intel.com/akdlm/IRC_NAS/cca951e1-31e7-485e-b300-fe7627cb8c08/intel-oneapi-base-toolkit-2025.1.0.651_offline.sh",
+            "98cad2489f2c90a2b328568a59371cf35855a3338643f61a9fc2d16a265d29f22feb2d673916dd7be18fa12a5e6d2475",
         )
         return
```
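
Replacing the separate oneDNN and oneMKL packages with the single oneAPI Base Toolkit 2025.1 installer leaves one artifact to download, verify, and execute. As a rough illustration only, here is a standalone sketch of what a helper like install_package might do: the install prefix is invented, the SHA-384 choice is inferred from the 96-hex-digit checksums above, and the silent-install flags follow Intel's documented offline-installer usage, not necessarily this repo's real implementation (which is a method on the helper class in utils/oneapi.py):

```python
import hashlib
import subprocess
from pathlib import Path
from shutil import copyfileobj
from urllib.request import urlopen  # nosec B404

ONEAPI_DIR = Path.home() / "intel-oneapi"  # assumed install prefix

def install_package(name: str, url: str, checksum: str) -> None:
    """Download, verify, and silently run an Intel offline installer (sketch)."""
    ONEAPI_DIR.mkdir(parents=True, exist_ok=True)
    installer = ONEAPI_DIR / f"{name}.sh"
    if not installer.exists():
        with urlopen(url) as in_stream, open(installer, "wb") as out_file:
            copyfileobj(in_stream, out_file)
    # The 96-hex-digit checksums above are consistent with SHA-384.
    sha = hashlib.sha384()
    with open(installer, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    if sha.hexdigest() != checksum:
        raise RuntimeError(f"checksum mismatch for {installer}")
    # Intel's offline installers document these flags: -a forwards arguments
    # to the bundled installer, -s runs it non-interactively.
    subprocess.run(
        ["sh", str(installer), "-a", "-s", "--eula", "accept",
         f"--install-dir={ONEAPI_DIR}"],
        check=True,
    )
```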

Diff for: devops/scripts/benchmarks/utils/utils.py (+5 −3)

```diff
@@ -9,11 +9,11 @@
 import subprocess
 
 import tarfile
-import urllib # nosec B404
 from options import options
 from pathlib import Path
 import hashlib
-
+from urllib.request import urlopen # nosec B404
+from shutil import copyfileobj
 
 def run(
     command,
@@ -147,7 +147,9 @@ def download(dir, url, file, untar=False, unzip=False, checksum=""):
     data_file = os.path.join(dir, file)
     if not Path(data_file).exists():
         print(f"{data_file} does not exist, downloading")
-        urllib.request.urlretrieve(url, data_file)
+        with urlopen(url) as in_stream, open(data_file, 'wb') as out_file:
+            copyfileobj(in_stream, out_file)
+
     calculated_checksum = calculate_checksum(data_file)
     if calculated_checksum != checksum:
         print(
```
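
urllib.request.urlretrieve is documented as a legacy interface that may become deprecated, which lines up with the commit message's note that the old code path failed on some configurations. The replacement streams the HTTP response straight to disk. Here is a self-contained sketch of the same pattern, where fetch is a hypothetical name rather than one of the script's helpers:

```python
from shutil import copyfileobj
from urllib.request import urlopen  # nosec B404

def fetch(url: str, dest: str, chunk_size: int = 1 << 20) -> None:
    """Stream `url` to `dest` in fixed-size chunks (hypothetical helper).

    Unlike urlretrieve, this keeps only one chunk in memory at a time,
    which matters for multi-gigabyte GGUF models.
    """
    with urlopen(url) as in_stream, open(dest, "wb") as out_file:
        copyfileobj(in_stream, out_file, chunk_size)
```

The diff above inlines this same pattern directly in download(), leaving the existing checksum verification untouched.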
