Fix Evenly_on_NUMAs binding policy issue for the 70B and 8B models

louie-tsai · louie-tsai · commit df4933da98b8 · 2025-11-01T07:22:30.000+08:00
Use one CPU ID per physical core, and fall back to matching on model_id alone when no row matches the input/output lengths.

Signed-off-by: louie-tsai <louie.tsai@intel.com>
diff --git a/.cd/server/cpu_binding.py b/.cd/server/cpu_binding.py
@@ -62,7 +62,8 @@ def __init__(self, csv_path: str = "cpu_binding_gnr.csv", use_hyperthread: bool
             self.node_to_cpus = []
             for i in range(self.numa_size):
                 from numa import info
-                node_intersect = [cpu for cpu in info.node_to_cpus(i) if cpu in self.cpus_allow_list]
+                filtered_node_to_cpus = self.filter_one_cpu_per_core(info.node_to_cpus(i))
+                node_intersect = [cpu for cpu in filtered_node_to_cpus if cpu in self.cpus_allow_list]
                 if bool(node_intersect):
                     self.node_to_cpus.append(list(node_intersect))
             self.node_to_idle_cpus = self.node_to_cpus.copy()
@@ -98,11 +99,33 @@ def pick_row_by_parameters(self, rows: list[dict], model: str, input_tok: str, o
             r for r in rows if r.get("model_id", "").strip() == model if r.get("input_length", "").strip() == input_tok
             if r.get("output_length", "").strip() == output_tok
         ]
+        if not matches:
+            # fallback: match only by model_id
+            matches = [r for r in rows if r.get('model_id', '') == model]
+            print(f"Warning: using fallback entry for model '{model}' without exact input/output token match")
         if not matches:
             available = ", ".join(sorted({r.get('model_id', '') for r in rows}))
             raise ValueError(f"MODEL '{model}', input_length '{input_tok}', output_length '{output_tok}' "
                              f"not found in CSV. Available: {available}")
         return matches[0]
+    def filter_one_cpu_per_core(self, cpus):
+        """
+        Given CPU IDs (possibly with HT pairs), return a sorted list that keeps only the
+        lowest ID per sysfs core_id; CPUs with no topology/core_id file are skipped.
+        """
+        seen_cores = set()
+        filtered = []
+        for cpu in sorted(cpus):
+            core_path = f"/sys/devices/system/cpu/cpu{cpu}/topology/core_id"
+            try:
+                with open(core_path) as f:
+                    core_id = int(f.read().strip())
+            except FileNotFoundError:
+                continue
+            if core_id not in seen_cores:
+                seen_cores.add(core_id)
+                filtered.append(cpu)
+        return filtered
 
     def get_cpus_id_binding_based_on_numa_nodes(self, rank: int) -> str:
         """Return CPUs id binding based on NUMA nodes.
@@ -116,8 +139,8 @@ def get_cpus_id_binding_based_on_numa_nodes(self, rank: int) -> str:
             return rank_to_cpus
 
         if self.binding_policy is BindingPolicy.Evenly_on_NUMAs or self.cards is None:
-            divider = min(self.world_size, len(self.node_to_cpus))
-            self.allocated_cpu_per_numa = self.num_allocated_cpu // divider
+            #divider = min(self.world_size, len(self.node_to_cpus))
+            self.allocated_cpu_per_numa = self.num_allocated_cpu // len(self.node_to_cpus)
             node_id = rank
         elif self.binding_policy is BindingPolicy.NUMAs_with_cards:
             self.allocated_cpu_per_numa = self.num_allocated_cpu // len(self.gaudi_numa_list)
@@ -147,7 +170,10 @@ def get_cpus_id_binding_based_on_numa_nodes(self, rank: int) -> str:
         numa_size = 1
     world_size = numa_size
     cpu_binder = CPU_Binding(use_hyperthread=False)
-    max_needed_numa_size = min(cpu_binder.world_size, cpu_binder.numa_size)
+    if cpu_binder.binding_policy is BindingPolicy.Evenly_on_NUMAs or cpu_binder.cards is None:
+        max_needed_numa_size = len(cpu_binder.node_to_cpus)
+    elif cpu_binder.binding_policy is BindingPolicy.NUMAs_with_cards:
+        max_needed_numa_size = min(cpu_binder.world_size, len(cpu_binder.node_to_cpus))
     for i in range(max_needed_numa_size):
         rank_to_cpus = cpu_binder.get_cpus_id_binding_based_on_numa_nodes(i)
         print(rank_to_cpus)
diff --git a/.cd/server/cpu_binding_gnr.csv b/.cd/server/cpu_binding_gnr.csv
@@ -5,3 +5,6 @@ meta-llama/Llama-3.1-405B-Instruct,4096,128,8,bf16,18,0
 meta-llama/Llama-3.1-70B-Instruct,128,4096,4,bf16,12,0
 meta-llama/Llama-3.1-70B-Instruct,2048,2048,4,bf16,12,0
 meta-llama/Llama-3.1-70B-Instruct,4096,128,4,bf16,12,0
+meta-llama/Llama-3.1-8B-Instruct,128,4096,1,bf16,6,0
+meta-llama/Llama-3.1-8B-Instruct,2048,2048,1,bf16,6,0
+meta-llama/Llama-3.1-8B-Instruct,4096,128,1,bf16,6,0
diff --git a/.cd/server/generate_cpu_binding_from_csv.py b/.cd/server/generate_cpu_binding_from_csv.py
@@ -7,7 +7,7 @@
 from ruamel.yaml.comments import CommentedMap
 from ruamel.yaml.scalarstring import DoubleQuotedScalarString
 # Import CPU_Binding directly from sibling cpu_binding.py
-from cpu_binding import CPU_Binding
+from cpu_binding import CPU_Binding, BindingPolicy
 
 SERVICE_NAME = "vllm-server"  # single service
 XSET_NAME = "vllm_server_cpu"  # x-sets key/anchor
@@ -19,7 +19,10 @@ def build_cpuset_and_limit(csv_path: str):
     cpus_list = ''
     idle_cpus_list = ''
     cpu_binder = CPU_Binding(csv_path=csv_path, use_hyperthread=False)
-    max_needed_numa_size = min(cpu_binder.world_size, cpu_binder.numa_size)
+    if cpu_binder.binding_policy is BindingPolicy.Evenly_on_NUMAs or cpu_binder.cards is None:
+        max_needed_numa_size = len(cpu_binder.node_to_cpus)
+    elif cpu_binder.binding_policy is BindingPolicy.NUMAs_with_cards:
+        max_needed_numa_size = min(cpu_binder.world_size, len(cpu_binder.node_to_cpus))
     for rank in range(max_needed_numa_size):
         rank_to_cpus = cpu_binder.get_cpus_id_binding_based_on_numa_nodes(rank)
         if rank_to_cpus not in cpus_list: