
Commit d54e516

add cpu core pinning to vllm-server on Gaudi3 + GNR for Llama405B and 70B
1 parent 0924722 commit d54e516

File tree

8 files changed: +414 additions, −2 deletions


.cd/README.md

Lines changed: 32 additions & 1 deletion
@@ -64,6 +64,37 @@ cd vllm-gaudi/.cd/

This launches the vLLM server and runs the benchmark suite automatically.

#### 2.1 (Optional) Running the server with a benchmark and pinning CPU cores for memory access coherence

To improve memory access coherence and free the remaining CPUs for other CPU-only workloads (for example, a CPU-based vLLM server running Llama3 8B), pin the CPU cores per NUMA node using an auto-generated docker-compose.override.yml file.
The Python scripts need a couple of libraries, so install the required packages with the following command.

```bash
pip install -r vllm-fork/.cd/server/requirements_cpu_binding.txt
```

Run the commands below to generate docker-compose.override.yml and launch the server with its CPU cores pinned.

```bash
cd vllm-fork/.cd/
export MODEL="Qwen/Qwen2.5-14B-Instruct"
export HF_TOKEN="<your huggingface token>"
export DOCKER_IMAGE="vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/vllm-installer-2.7.1:latest"
python3 server/generate_cpu_binding_from_csv.py --settings server/cpu_binding.csv --output ./docker-compose.override.yml
docker compose --profile benchmark -f docker-compose.yml -f docker-compose.override.yml up
```

To also bind the remaining idle CPUs to another service such as vllm-cpu-service, pass the service name via --cpuservice so that docker-compose.override.yml pins that service to the idle CPUs as well.
The example below binds the idle CPUs to the vllm-cpu-service service, which is defined in docker-compose.vllm-cpu-service.yml. A sketch of what the generated override file might look like follows these examples.

```bash
cd vllm-fork/.cd/
export MODEL="Qwen/Qwen2.5-14B-Instruct"
export HF_TOKEN="<your huggingface token>"
export DOCKER_IMAGE="vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/vllm-installer-2.7.1:latest"
python3 server/generate_cpu_binding_from_csv.py --settings server/cpu_binding.csv --output ./docker-compose.override.yml --cpuservice vllm-cpu-service
docker compose --profile benchmark -f docker-compose.yml -f docker-compose.vllm-cpu-service.yml -f docker-compose.override.yml up
```
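
For orientation, a generated docker-compose.override.yml could look roughly like the sketch below. This is an illustration, not actual tool output: the service names, core ranges, and the use of the Compose cpuset field are assumptions based on the description above; the real contents depend on your host's NUMA topology and the settings CSV.

```yaml
# Hypothetical example of a generated docker-compose.override.yml (illustrative only)
services:
  vllm-server:                 # assumed name of the Gaudi serving service
    cpuset: "110-127,238-255"  # cores local to the NUMA nodes of the Gaudi cards
  vllm-cpu-service:            # only added when --cpuservice is passed
    cpuset: "0-85"             # remaining idle cores
```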

### 3. Run the server using Docker Compose with custom parameters

To override default settings, you can provide additional parameters when starting the server. This is a more advanced approach:
@@ -129,7 +160,7 @@ cd vllm-gaudi/.cd/
MAX_MODEL_LEN=2048 \
INPUT_TOK=128 \
OUTPUT_TOK=128 \
- CON_REQ=16 \
+ CONCURRENT_REQ=16 \
NUM_PROMPTS=64 \
docker compose --profile benchmark up
```

.cd/benchmark/benchmark_user.env

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
MODEL
INPUT_TOK
OUTPUT_TOK
- CON_REQ
+ CONCURRENT_REQ
NUM_PROMPTS

.cd/docker-compose.yml

Lines changed: 2 additions & 0 deletions
@@ -42,4 +42,6 @@ services:
      - PYTHONUNBUFFERED=1
    env_file:
      - ./benchmark/benchmark_user.env
+   volumes:
+     - ./logs:/root/scripts/logs
    command: ["benchmark", "--config-file", "${VLLM_BENCHMARK_CONFIG_FILE}", "--config-name", "${VLLM_BENCHMARK_CONFIG_NAME}"]

.cd/server/cpu_binding.py

Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
# SPDX-License-Identifier: Apache-2.0
import os
import csv
from importlib import util
from enum import Enum
from typing import List

from gaudi_topology import GaudiTopology

REQUIRED_COLUMNS = ["model_id", "input_length", "output_length", "world_size", "data_type", "num_allocated_cpu"]


class BindingPolicy(Enum):
    Evenly_on_NUMAs = "evenly"
    NUMAs_with_cards = "close2cards"


class CPU_Binding():

    def __init__(self,
                 csv_path: str = "cpu_binding_gnr.csv",
                 use_hyperthread: bool = False):
        self.libnuma_found = util.find_spec("numa") is not None
        self.psutil_found = util.find_spec("psutil") is not None
        if self.libnuma_found and self.psutil_found:
            import psutil
            from numa import info
            # Get system info
            self.cpu_count = psutil.cpu_count(logical=False)
            self.cpus_allow_list = psutil.Process().cpu_affinity()
            self.numa_size = info.get_num_configured_nodes()
            self.cpu_count_per_numa = self.cpu_count // self.numa_size

        # Get CSV info
        with open(csv_path, newline="") as f:
            rows = list(csv.DictReader(f))
        if not rows or any(col not in rows[0] for col in REQUIRED_COLUMNS):
            found = list(rows[0].keys()) if rows else "EMPTY CSV"
            raise ValueError(f"CSV missing required headers {REQUIRED_COLUMNS}. Found: {found}")
        model = os.environ.get("MODEL")
        if not model:
            raise RuntimeError("Set environment variable MODEL to a model_id in the CSV "
                               "(e.g., export MODEL='meta-llama/Llama-3.1-8B-Instruct').")
        input_tok = os.environ.get("INPUT_TOK")
        output_tok = os.environ.get("OUTPUT_TOK")
        con_req = os.environ.get("CONCURRENT_REQ")
        num_allocated_cpu = os.environ.get("NUM_CPUS")
        print(num_allocated_cpu)

        row = self.pick_row_by_parameters(rows, model, input_tok, output_tok, con_req)
        print(row["num_allocated_cpu"])

        self.world_size = self.parse_int(row["world_size"], "world_size")
        binding_policy_index = self.parse_int(row["binding_policy"], "binding_policy")
        self.binding_policy = list(BindingPolicy)[binding_policy_index]

        if num_allocated_cpu:
            self.num_allocated_cpu = int(num_allocated_cpu)
        elif row["num_allocated_cpu"] == 'NA':
            raise RuntimeError("Invalid num_allocated_cpu value in the CSV. Set environment variable NUM_CPUS instead.")
        else:
            self.num_allocated_cpu = self.parse_int(row["num_allocated_cpu"], "num_allocated_cpu")

        # CPU: build per-NUMA-node CPU lists, restricted to the allowed affinity mask
        self.node_to_cpus = []
        for i in range(self.numa_size):
            from numa import info
            node_intersect = [cpu for cpu in info.node_to_cpus(i) if cpu in self.cpus_allow_list]
            if node_intersect:
                self.node_to_cpus.append(list(node_intersect))
        self.node_to_idle_cpus = self.node_to_cpus.copy()
        for i in range(self.numa_size):
            if use_hyperthread is False:
                self.node_to_idle_cpus[i] = self.node_to_cpus[i][:self.cpu_count_per_numa]
            else:
                self.node_to_idle_cpus[i] = self.node_to_cpus[i][self.cpu_count_per_numa:]

        # Gaudi: map cards to their NUMA nodes
        topo = GaudiTopology()
        self.cards = topo.get_cards()
        if self.cards is not None:
            self.gaudi_numa_list = []
            # Assume cards 0 to world_size-1 are used
            for card in self.cards[:self.world_size]:
                if card['numa_node'] not in self.gaudi_numa_list:
                    self.gaudi_numa_list.append(card['numa_node'])
                print(f"Card {card['card_id']} ({card['model']}):")
                print(f"  Bus ID      : {card['bus_id']}")
                print(f"  NUMA Node   : {card['numa_node']}")
                print(f"  Local CPUs  : {card['local_cpulist']}")

    def parse_int(self, v: str, name: str) -> int:
        try:
            return int(v)
        except Exception:
            raise ValueError(f"Invalid integer for {name!r}: {v!r}")

    def pick_row_by_parameters(self, rows: List[dict], model: str, input_tok: str, output_tok: str, con_req: str) -> dict:
        matches = [
            r for r in rows
            if r.get("model_id", "").strip() == model
            if r.get("input_length", "").strip() == input_tok
            if r.get("output_length", "").strip() == output_tok
        ]
        if not matches:
            available = ", ".join(sorted({r.get('model_id', '') for r in rows}))
            raise ValueError(f"MODEL '{model}', input_length '{input_tok}', output_length '{output_tok}' "
                             f"not found in CSV. Available: {available}")
        return matches[0]

    def get_cpus_id_binding_based_on_numa_nodes(self, rank: int) -> str:
        """Return the CPU id binding for a rank based on NUMA nodes."""
        rank_to_cpus = ''
        if not self.libnuma_found or not self.psutil_found:
            print("Auto thread-binding is not supported due to "
                  "the lack of the numa and psutil packages; "
                  "falling back to no thread-binding. To get better performance, "
                  "please try to bind threads manually.")
            return rank_to_cpus

        if self.binding_policy is BindingPolicy.Evenly_on_NUMAs or self.cards is None:
            divider = min(self.world_size, len(self.node_to_cpus))
            self.allocated_cpu_per_numa = self.num_allocated_cpu // divider
            node_id = rank
        elif self.binding_policy is BindingPolicy.NUMAs_with_cards:
            self.allocated_cpu_per_numa = self.num_allocated_cpu // len(self.gaudi_numa_list)
            node_id = int(self.cards[rank]['numa_node'])

        print("binding numa node_id %d allocated_cpu_per_numa %d" % (node_id, self.allocated_cpu_per_numa))
        # Option 1. Bind to the last N cpu cores of the node
        start = self.cpu_count_per_numa - self.allocated_cpu_per_numa
        rank_to_cpus_list = self.node_to_cpus[node_id][start:self.cpu_count_per_numa]
        # Option 2. Bind to the first N cpu cores of the node
        # rank_to_cpus_list = self.node_to_cpus[node_id][:self.allocated_cpu_per_numa]

        rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list)
        print("rank %d auto thread-binding list: %s" % (rank, rank_to_cpus))
        self.node_to_idle_cpus[node_id] = [cpu for cpu in self.node_to_idle_cpus[node_id]
                                           if cpu not in rank_to_cpus_list]
        return rank_to_cpus


if __name__ == "__main__":
    libnuma_found = util.find_spec("numa") is not None
    if libnuma_found:
        from numa import info
        numa_size = info.get_num_configured_nodes()
    else:
        numa_size = 1
    world_size = numa_size
    cpu_binder = CPU_Binding(use_hyperthread=False)
    max_needed_numa_size = min(cpu_binder.world_size, cpu_binder.numa_size)
    for i in range(max_needed_numa_size):
        rank_to_cpus = cpu_binder.get_cpus_id_binding_based_on_numa_nodes(i)
        print(rank_to_cpus)

    rank_to_idle_cpus = ','.join(str(x) for row in cpu_binder.node_to_idle_cpus for x in row)
    print(rank_to_idle_cpus)
    for r in cpu_binder.node_to_idle_cpus:
        print(len(r))
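
To make the binding arithmetic above concrete, here is a minimal standalone sketch of the "Option 1" core selection for the Llama-3.1-70B rows (num_allocated_cpu=12, binding_policy 0, i.e. evenly across NUMA nodes). The host layout is invented for illustration (2 NUMA nodes with 43 physical cores each); on a real system these values come from psutil and libnuma.

```python
# Illustrative sketch of CPU_Binding's per-rank core selection (assumed host values).
num_allocated_cpu = 12                                     # from the 70B rows in cpu_binding_gnr.csv
world_size = 4                                             # tensor-parallel ranks
node_to_cpus = [list(range(0, 43)), list(range(43, 86))]   # assumption: 2 NUMA nodes, 43 cores each
cpu_count_per_numa = 43

divider = min(world_size, len(node_to_cpus))               # Evenly_on_NUMAs policy
allocated_cpu_per_numa = num_allocated_cpu // divider      # 12 // 2 = 6 cores per NUMA node

for rank in range(len(node_to_cpus)):
    start = cpu_count_per_numa - allocated_cpu_per_numa    # Option 1: take the last N cores of the node
    cores = node_to_cpus[rank][start:cpu_count_per_numa]
    print(f"rank {rank}: {','.join(map(str, cores))}")
# rank 0: 37,38,39,40,41,42
# rank 1: 80,81,82,83,84,85
```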

.cd/server/cpu_binding_gnr.csv

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
model_id,input_length,output_length,world_size,data_type,num_allocated_cpu,binding_policy
meta-llama/Llama-3.1-405B-Instruct,128,4096,8,bf16,18,0
meta-llama/Llama-3.1-405B-Instruct,2048,2048,8,bf16,18,0
meta-llama/Llama-3.1-405B-Instruct,4096,128,8,bf16,18,0
meta-llama/Llama-3.1-70B-Instruct,128,4096,4,bf16,12,0
meta-llama/Llama-3.1-70B-Instruct,2048,2048,4,bf16,12,0
meta-llama/Llama-3.1-70B-Instruct,4096,128,4,bf16,12,0
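
A note on the last column: cpu_binding.py treats binding_policy as an index into its BindingPolicy enum, so the 0 in every row above selects the "evenly" policy. A minimal sketch of that mapping (the enum mirrors the one defined in cpu_binding.py):

```python
from enum import Enum

class BindingPolicy(Enum):            # mirrors cpu_binding.py
    Evenly_on_NUMAs = "evenly"        # binding_policy = 0 in the CSV
    NUMAs_with_cards = "close2cards"  # binding_policy = 1

print(list(BindingPolicy)[0])         # BindingPolicy.Evenly_on_NUMAs
```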

.cd/server/gaudi_topology.py

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
#!/usr/bin/env python3
# ==============================================================================
# gaudi_topology.py
# Provides GaudiTopology class:
#   - discover all Gaudi cards via hl-smi
#   - return NUMA node and CPU IDs per card
# Works with hl-smi v1.22.0+ (HL-325L / Gaudi3) table format.
# ==============================================================================

import subprocess
import re
import os
import shutil
from typing import List, Dict, Optional


class GaudiTopology:
    """Utility class to discover Gaudi cards and their NUMA / CPU locality."""

    def __init__(self):
        self.cards = self._discover_cards()

    # ------------------------------------------------------------------
    def _run_cmd(self, cmd: str) -> str:
        """Run a shell command and return stdout."""
        try:
            result = subprocess.run(cmd, shell=True, check=True,
                                    stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                    text=True)
            return result.stdout
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"Command failed: {cmd}\n{e.stderr}")

    # ------------------------------------------------------------------
    def _parse_hl_smi_table(self, text: str) -> List[Dict]:
        """
        Parse hl-smi v1.22+ table format.
        Example line:
        |   0  HL-325L             N/A  | 0000:97:00.0     N/A |
        """
        cards = []
        pattern = re.compile(
            r'^\|\s*(\d+)\s+([A-Z0-9-]+)\s+N/A\s+\|\s*([0-9a-fA-F:.]+)\s+N/A\s*\|'
        )
        for line in text.splitlines():
            match = pattern.match(line)
            if not match:
                continue
            card_id, model, bus_id = match.groups()
            if not bus_id.startswith("0000:"):
                bus_id = "0000:" + bus_id
            cards.append({
                "card_id": int(card_id),
                "model": model,
                "bus_id": bus_id
            })
        return cards

    # ------------------------------------------------------------------
    def _get_sysfs_info(self, bus_id: str) -> Dict[str, Optional[str]]:
        """Fetch NUMA node and local CPU list from sysfs."""
        sys_path = f"/sys/bus/pci/devices/{bus_id}"
        info = {"numa_node": None, "local_cpulist": None}
        try:
            with open(os.path.join(sys_path, "numa_node")) as f:
                info["numa_node"] = f.read().strip()
        except FileNotFoundError:
            pass
        try:
            with open(os.path.join(sys_path, "local_cpulist")) as f:
                info["local_cpulist"] = f.read().strip()
        except FileNotFoundError:
            pass
        return info

    # ------------------------------------------------------------------
    def _discover_cards(self) -> Optional[List[Dict]]:
        """Run hl-smi and discover Gaudi cards."""
        if shutil.which("hl-smi") is None:
            print("No hl-smi found")
            return None

        hl_smi_output = self._run_cmd("hl-smi")
        cards = self._parse_hl_smi_table(hl_smi_output)
        for c in cards:
            sysfs_info = self._get_sysfs_info(c["bus_id"])
            c.update(sysfs_info)
        return cards

    # ------------------------------------------------------------------
    def get_cards(self) -> Optional[List[Dict]]:
        """Return list of all discovered cards sorted by NUMA node (then card_id)."""
        if self.cards is None:
            return None

        def sort_key(c):
            # Convert numa_node to int when possible, else put N/A at the end
            try:
                return (int(c["numa_node"]), c["card_id"])
            except (TypeError, ValueError):
                return (999, c["card_id"])
        return sorted(self.cards, key=sort_key)

    # ------------------------------------------------------------------
    def get_numa_for_card(self, card_id: int) -> Optional[str]:
        """Return NUMA node for a given card ID."""
        for c in self.cards:
            if c["card_id"] == card_id:
                return c["numa_node"]
        return None

    # ------------------------------------------------------------------
    def get_cpus_for_card(self, card_id: int) -> Optional[str]:
        """Return local CPU list for a given card ID."""
        for c in self.cards:
            if c["card_id"] == card_id:
                return c["local_cpulist"]
        return None

# ------------------------------------------------------------------------------

if __name__ == "__main__":
    topo = GaudiTopology()
    for card in topo.get_cards() or []:
        print(f"Card {card['card_id']} ({card['model']}):")
        print(f"  Bus ID      : {card['bus_id']}")
        print(f"  NUMA Node   : {card['numa_node']}")
        print(f"  Local CPUs  : {card['local_cpulist']}")
        print()
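
As a quick sanity check of the table parser above, the regex can be exercised against the example line from the _parse_hl_smi_table docstring (a minimal sketch; column spacing in real hl-smi output may differ):

```python
import re

# Same pattern as GaudiTopology._parse_hl_smi_table
pattern = re.compile(
    r'^\|\s*(\d+)\s+([A-Z0-9-]+)\s+N/A\s+\|\s*([0-9a-fA-F:.]+)\s+N/A\s*\|'
)
sample = "|   0  HL-325L             N/A  | 0000:97:00.0     N/A |"
print(pattern.match(sample).groups())   # ('0', 'HL-325L', '0000:97:00.0')
```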
