@@ -62,7 +62,8 @@ def __init__(self, csv_path: str = "cpu_binding_gnr.csv", use_hyperthread: bool
6262 self .node_to_cpus = []
6363 for i in range (self .numa_size ):
6464 from numa import info
65- node_intersect = [cpu for cpu in info .node_to_cpus (i ) if cpu in self .cpus_allow_list ]
65+ filtered_node_to_cpus = self .filter_one_cpu_per_core (info .node_to_cpus (i ))
66+ node_intersect = [cpu for cpu in filtered_node_to_cpus if cpu in self .cpus_allow_list ]
6667 if bool (node_intersect ):
6768 self .node_to_cpus .append (list (node_intersect ))
6869 self .node_to_idle_cpus = self .node_to_cpus .copy ()
@@ -98,11 +99,33 @@ def pick_row_by_parameters(self, rows: list[dict], model: str, input_tok: str, o
9899 r for r in rows if r .get ("model_id" , "" ).strip () == model if r .get ("input_length" , "" ).strip () == input_tok
99100 if r .get ("output_length" , "" ).strip () == output_tok
100101 ]
102+ if not matches :
103+ # fallback: match only by model_id
104+ matches = [r for r in rows if r .get ('model_id' , '' ) == model ]
105+ print (f"Warning: using fallback entry for model '{ model } ' without exact input/output token match" )
101106 if not matches :
102107 available = ", " .join (sorted ({r .get ('model_id' , '' ) for r in rows }))
103108 raise ValueError (f"MODEL '{ model } ', input_length '{ input_tok } ', output_length '{ output_tok } ' "
104109 f"not found in CSV. Available: { available } " )
105110 return matches [0 ]
111+ def filter_one_cpu_per_core (self , cpus ):
112+ """
113+ Given a list of CPU IDs (possibly with HT pairs),
114+ return a filtered list with only one logical CPU per physical core.
115+ """
116+ seen_cores = set ()
117+ filtered = []
118+ for cpu in sorted (cpus ):
119+ core_path = f"/sys/devices/system/cpu/cpu{ cpu } /topology/core_id"
120+ try :
121+ with open (core_path ) as f :
122+ core_id = int (f .read ().strip ())
123+ except FileNotFoundError :
124+ continue
125+ if core_id not in seen_cores :
126+ seen_cores .add (core_id )
127+ filtered .append (cpu )
128+ return filtered
106129
107130 def get_cpus_id_binding_based_on_numa_nodes (self , rank : int ) -> str :
108131 """Return CPUs id binding based on NUMA nodes.
@@ -116,8 +139,8 @@ def get_cpus_id_binding_based_on_numa_nodes(self, rank: int) -> str:
116139 return rank_to_cpus
117140
118141 if self .binding_policy is BindingPolicy .Evenly_on_NUMAs or self .cards is None :
119- divider = min (self .world_size , len (self .node_to_cpus ))
120- self .allocated_cpu_per_numa = self .num_allocated_cpu // divider
142+ # divider = min(self.world_size, len(self.node_to_cpus))
143+ self .allocated_cpu_per_numa = self .num_allocated_cpu // len ( self . node_to_cpus )
121144 node_id = rank
122145 elif self .binding_policy is BindingPolicy .NUMAs_with_cards :
123146 self .allocated_cpu_per_numa = self .num_allocated_cpu // len (self .gaudi_numa_list )
@@ -147,7 +170,10 @@ def get_cpus_id_binding_based_on_numa_nodes(self, rank: int) -> str:
147170 numa_size = 1
148171 world_size = numa_size
149172 cpu_binder = CPU_Binding (use_hyperthread = False )
150- max_needed_numa_size = min (cpu_binder .world_size , cpu_binder .numa_size )
173+ if cpu_binder .binding_policy is BindingPolicy .Evenly_on_NUMAs or cpu_binder .cards is None :
174+ max_needed_numa_size = len (cpu_binder .node_to_cpus )
175+ elif cpu_binder .binding_policy is BindingPolicy .NUMAs_with_cards :
176+ max_needed_numa_size = min (cpu_binder .world_size , len (cpu_binder .node_to_cpus ))
151177 for i in range (max_needed_numa_size ):
152178 rank_to_cpus = cpu_binder .get_cpus_id_binding_based_on_numa_nodes (i )
153179 print (rank_to_cpus )
0 commit comments