22import os
33import csv
44from importlib import util
5- from typing import Optional
65from enum import Enum
76from gaudi_topology import GaudiTopology
8- from typing import List , Tuple
# Header names the binding CSV must provide; the loader raises a ValueError
# naming this list when the CSV header check fails.
REQUIRED_COLUMNS = [
    "model_id",
    "input_length",
    "output_length",
    "world_size",
    "data_type",
    "num_allocated_cpu",
]
9+
1010
class BindingPolicy(Enum):
    """CPU-to-rank binding strategies selectable for the CPU binder.

    The member values are the short string spellings of each policy.
    """

    # With this policy the binder divides the allocated CPUs by
    # min(world_size, number of NUMA nodes) and uses the rank as the node id.
    Evenly_on_NUMAs = "evenly"
    # NOTE(review): presumably picks CPUs from the NUMA nodes that host the
    # accelerator cards -- confirm against the binder implementation.
    NUMAs_with_cards = "close2cards"
1414
1515
16- class CPU_Binding () :
16+ class CPU_Binding :
1717
18- def __init__ (self ,
19- csv_path : str = "cpu_binding_gnr.csv" ,
20- use_hyperthread : bool = False ):
18+ def __init__ (self , csv_path : str = "cpu_binding_gnr.csv" , use_hyperthread : bool = False ):
2119 self .libnuma_found = util .find_spec ("numa" ) is not None
2220 self .psutil_found = util .find_spec ("psutil" ) is not None
2321 if self .libnuma_found and self .psutil_found :
@@ -28,7 +26,7 @@ def __init__(self,
2826 self .cpus_allow_list = psutil .Process ().cpu_affinity ()
2927 #print("cpu allow list:",self.cpus_allow_list)
3028 self .numa_size = info .get_num_configured_nodes ()
31- self .cpu_count_per_numa = self .cpu_count // self .numa_size
29+ self .cpu_count_per_numa = self .cpu_count // self .numa_size
3230
3331 # Get CSV info
3432 with open (csv_path , newline = "" ) as f :
@@ -38,7 +36,7 @@ def __init__(self,
3836 raise ValueError (f"CSV missing required headers { REQUIRED_COLUMNS } . Found: { found } " )
3937 model = os .environ .get ("MODEL" )
4038 if not model :
41- raise RuntimeError ("Set environment variable MODEL to a model_id in the CSV (e.g., export MODEL='meta-llama/Llama-3.1-8B-Instruct') ." )
39+ raise RuntimeError ("Set environment variable MODEL to a model_id in the CSV." )
4240 input_tok = os .environ .get ("INPUT_TOK" )
4341 output_tok = os .environ .get ("OUTPUT_TOK" )
4442 con_req = os .environ .get ("CONCURRENT_REQ" )
@@ -57,7 +55,7 @@ def __init__(self,
5755 elif row ["num_allocated_cpu" ] == 'NA' :
5856 raise RuntimeError ("Invalid NUM_CPU value. Set environment variable NUM_CPUS instead ." )
5957 else :
60- self .num_allocated_cpu = self .parse_int (row ["num_allocated_cpu" ], "num_allocated_cpu" )
58+ self .num_allocated_cpu = self .parse_int (row ["num_allocated_cpu" ], "num_allocated_cpu" )
6159
6260 # CPU
6361 # check allow node_to_cpus list
@@ -77,8 +75,8 @@ def __init__(self,
7775 # Gaudi
7876 topo = GaudiTopology ()
7977 self .cards = topo .get_cards ()
80- if self .cards != None :
81- self .gaudi_numa_list = []
78+ if self .cards is not None :
79+ self .gaudi_numa_list = []
8280 # Assume to use cards from 0 to 7
8381 for card in self .cards [:self .world_size ]:
8482 if card ['numa_node' ] not in self .gaudi_numa_list :
@@ -91,36 +89,34 @@ def __init__(self,
def parse_int(self, v: str, name: str) -> int:
    """Convert a textual field to an integer.

    Args:
        v: Raw value to convert (e.g. a CSV cell).
        name: Name of the field; used only to build the error message.

    Returns:
        ``int(v)``.

    Raises:
        ValueError: If ``v`` cannot be converted. The underlying error is
            chained as ``__cause__`` so the original reason stays visible.
    """
    try:
        return int(v)
    # Only the errors int() itself raises are translated; anything else is a
    # genuine bug and should surface unchanged.
    except (TypeError, ValueError, OverflowError) as err:
        raise ValueError(f"Invalid integer for {name!r}: {v!r}") from err
9694
def pick_row_by_parameters(self, rows: list[dict], model: str, input_tok: str, output_tok: str,
                           con_req: str) -> dict:
    """Return the first row matching the model and token lengths.

    A row matches when its ``model_id``, ``input_length`` and
    ``output_length`` cells, stripped of surrounding whitespace, equal
    ``model``, ``input_tok`` and ``output_tok`` respectively.

    Args:
        rows: Parsed CSV rows as dictionaries keyed by column name.
        model: Model identifier to look up.
        input_tok: Input length, compared as a string.
        output_tok: Output length, compared as a string.
        con_req: Accepted for interface compatibility; not used for matching.

    Returns:
        The first matching row, in the order given by ``rows``.

    Raises:
        ValueError: If no row matches; the message lists the model ids
            present in ``rows``.
    """

    def cell(row: dict, key: str) -> str:
        # csv.DictReader fills the missing cells of a short row with None, so
        # a plain .get(key, "") is not enough to make .strip() safe.
        return (row.get(key) or "").strip()

    wanted = (
        ("model_id", model),
        ("input_length", input_tok),
        ("output_length", output_tok),
    )
    for row in rows:
        if all(cell(row, key) == value for key, value in wanted):
            return row

    # Coerce None to "" so that sorted() never compares None with str.
    available = ", ".join(sorted({row.get('model_id') or '' for row in rows}))
    raise ValueError(f"MODEL '{model}', input_length '{input_tok}', output_length '{output_tok}' "
                     f"not found in CSV. Available: {available}")
108106
109- def get_cpus_id_binding_based_on_numa_nodes (self ,
110- rank : int ) -> str :
107+ def get_cpus_id_binding_based_on_numa_nodes (self , rank : int ) -> str :
111108 """Return CPUs id binding based on NUMA nodes.
112109 """
113110 rank_to_cpus = ''
114111 if not self .libnuma_found or not self .psutil_found :
115- print (
116- "Auto thread-binding is not supported due to "
117- "the lack of package numa and psutil,"
118- "fallback to no thread-binding. To get better performance,"
119- "please try to manually bind threads." )
112+ print ("Auto thread-binding is not supported due to "
113+ "the lack of package numa and psutil,"
114+ "fallback to no thread-binding. To get better performance,"
115+ "please try to manually bind threads." )
120116 return rank_to_cpus
121117
122118 if self .binding_policy is BindingPolicy .Evenly_on_NUMAs or self .cards is None :
123- divider = min (self .world_size , len (self .node_to_cpus ))
119+ divider = min (self .world_size , len (self .node_to_cpus ))
124120 self .allocated_cpu_per_numa = self .num_allocated_cpu // divider
125121 node_id = rank
126122 elif self .binding_policy is BindingPolicy .NUMAs_with_cards :
@@ -136,10 +132,13 @@ def get_cpus_id_binding_based_on_numa_nodes(self,
136132
137133 rank_to_cpus = ',' .join (str (x ) for x in rank_to_cpus_list )
138134 print ("rank %d auto thread-binding list: %s" , rank , rank_to_cpus )
139- self .node_to_idle_cpus [node_id ] = [cpu for cpu in self .node_to_idle_cpus [node_id ] if cpu not in rank_to_cpus_list ]
135+ self .node_to_idle_cpus [node_id ] = [
136+ cpu for cpu in self .node_to_idle_cpus [node_id ] if cpu not in rank_to_cpus_list
137+ ]
140138 return rank_to_cpus
141139
142- if __name__ == "__main__" :
140+
141+ if __name__ == "__main__" :
143142 libnuma_found = util .find_spec ("numa" ) is not None
144143 if libnuma_found :
145144 from numa import info
@@ -153,7 +152,6 @@ def get_cpus_id_binding_based_on_numa_nodes(self,
153152 rank_to_cpus = cpu_binder .get_cpus_id_binding_based_on_numa_nodes (i )
154153 print (rank_to_cpus )
155154
156-
157155 rank_to_idle_cpus = ',' .join (str (x ) for row in cpu_binder .node_to_idle_cpus for x in row )
158156 print (rank_to_idle_cpus )
159157 for r in cpu_binder .node_to_idle_cpus :
0 commit comments