@@ -16,10 +16,14 @@ def run_command(command, pwd, ignore_errors=False):
16
16
print (f" Running command in { pwd } :" )
17
17
print (f" { ' ' .join (command )} " )
18
18
os .makedirs (FLAGS .logs_dir , exist_ok = True )
19
- log_file = os .path .join (FLAGS .logs_dir , datetime .now ().strftime ("%Y-%m-%d-%H-%M-%S.log" ))
19
+ command_name = command [0 ].split (os .path .sep )[- 1 ]
20
+ log_file = os .path .join (FLAGS .logs_dir , f"{ datetime .now ().strftime ('%Y-%m-%d-%H-%M-%S' )} _{ command_name } .log" )
20
21
with open (log_file , "w" ) as fp :
21
22
try :
22
- subprocess .check_call (command , cwd = pwd , stdout = fp , stderr = fp )
23
+ if "llama-bench" in command_name :
24
+ subprocess .check_call (command , cwd = pwd )
25
+ else :
26
+ subprocess .check_call (command , cwd = pwd , stdout = fp , stderr = fp )
23
27
except subprocess .CalledProcessError as err :
24
28
if not ignore_errors :
25
29
print (RED + f"Please check { log_file } for what's wrong" + RESET )
@@ -48,6 +52,7 @@ def get_llamacpp_build_dir():
48
52
49
53
50
54
def compile_kernels ():
55
+ model_name = f"{ FLAGS .model } _{ str (FLAGS .quant_type ).upper ()} "
51
56
deploy_dir = os .path .join (ROOT_DIR , "deploy" )
52
57
tuned_dir = os .path .join (deploy_dir , "tuned" )
53
58
prebuilt_dir = os .path .join (tuned_dir , f"{ get_arch (FLAGS .device )} -{ FLAGS .model } " )
@@ -56,10 +61,18 @@ def compile_kernels():
56
61
shutil .copytree (prebuilt_dir , tuned_dir , dirs_exist_ok = True )
57
62
return
58
63
64
+ # Clear previous tune.log
65
+ command = [
66
+ 'rm' ,
67
+ os .path .join ("tuned" , "preprocessor" , "tune.log" ),
68
+ os .path .join ("tuned" , "qgemm_lut" , "tune.log" ),
69
+ ]
70
+ run_command (command , deploy_dir , ignore_errors = True )
71
+
59
72
qargs = get_quant_args ()
60
73
command = [
61
74
'python' , 'compile.py' ,
62
- '-o' , ' tuned' ,
75
+ '-o' , f' { os . path . join ( " tuned" , model_name ) } ' ,
63
76
'-da' ,
64
77
'-nt' , f'{ FLAGS .num_threads } ' ,
65
78
'-tb' ,
@@ -82,6 +95,11 @@ def compile_kernels():
82
95
command .append ('-v' )
83
96
run_command (command , deploy_dir )
84
97
98
+ # Move to pre-install directory
99
+ kernel_dir = os .path .join (tuned_dir , model_name )
100
+ print (f" Copy built kernels from { kernel_dir } to { tuned_dir } " )
101
+ shutil .copytree (kernel_dir , tuned_dir , dirs_exist_ok = True )
102
+
85
103
86
104
def _clean_cmake (build_dir ):
87
105
command = ['cmake' , '--build' , '.' , '--target' , 'clean' ]
@@ -123,31 +141,51 @@ def convert_models():
123
141
model_dir = FLAGS .model_dir
124
142
if not os .path .exists (model_dir ):
125
143
raise FileNotFoundError (model_dir )
126
- out_path = os .path .join (model_dir , f"ggml-model.{ FLAGS .quant_type } .gguf" )
144
+
145
+ out_type = FLAGS .quant_type
146
+ if FLAGS .quant_type == "q4_0" :
147
+ out_type = "f16"
148
+
149
+ model_name = f"{ os .path .split (model_dir )[- 1 ]} .{ str (out_type ).upper ()} .gguf"
150
+ out_path = os .path .join (model_dir , model_name )
127
151
kcfg_path = os .path .join (ROOT_DIR , "install" , "lib" , "kcfg.ini" )
128
152
llamacpp_dir = os .path .join (ROOT_DIR , "3rdparty" , "llama.cpp" )
129
153
command = [
130
154
'python' ,
131
155
'convert_hf_to_gguf.py' ,
132
156
f'{ model_dir } ' ,
133
- '--outtype' , f'{ FLAGS . quant_type } ' ,
157
+ '--outtype' , f'{ out_type } ' ,
134
158
'--outfile' , f'{ out_path } ' ,
135
159
'--kcfg' , f'{ kcfg_path } ' ,
136
160
'--enable-t-mac' ,
137
161
'--verbose' ,
138
162
]
139
163
run_command (command , llamacpp_dir )
140
164
165
+ if FLAGS .quant_type == "q4_0" :
166
+ quantized_model_name = f"{ os .path .split (model_dir )[- 1 ]} .Q4_0.gguf"
167
+ quantized_out_path = os .path .join (model_dir , quantized_model_name )
168
+ command = [
169
+ './build/bin/llama-quantize' ,
170
+ '--token-embedding-type' , 'f16' ,
171
+ '--output-tensor-type' , 'f16' ,
172
+ f'{ out_path } ' ,
173
+ f'{ quantized_out_path } ' ,
174
+ 'q4_0' ,
175
+ ]
176
+ run_command (command , llamacpp_dir )
177
+
141
178
142
179
def cmake_llamacpp ():
143
180
build_dir = get_llamacpp_build_dir ()
144
181
cmake_prefix_path = os .path .join (ROOT_DIR , "install" , "lib" , "cmake" , "t-mac" )
145
182
command = [
146
183
'cmake' , '..' ,
147
- '-DGGML_TMAC=ON ' ,
184
+ f '-DGGML_TMAC={ "OFF" if FLAGS . disable_t_mac else "ON" } ' ,
148
185
f'-DCMAKE_PREFIX_PATH={ cmake_prefix_path } ' ,
149
186
'-DCMAKE_BUILD_TYPE=Release' ,
150
187
'-DGGML_OPENMP=OFF' ,
188
+ f'-DGGML_TMAC_RECHUNK={ "ON" if FLAGS .rechunk else "OFF" } ' ,
151
189
]
152
190
if FLAGS .device == "android" :
153
191
try :
@@ -178,13 +216,14 @@ def cmake_llamacpp():
178
216
179
217
def build_llamacpp ():
180
218
build_dir = get_llamacpp_build_dir ()
181
- command = ['cmake' , '--build' , '.' , '--target' , 'llama-cli' , 'llama-bench' , 'llama-quantize' , '--config' , 'Release' ]
219
+ command = ['cmake' , '--build' , '.' , '--target' , 'llama-cli' , 'llama-bench' , 'llama-quantize' , 'llama-perplexity' , ' --config' , 'Release' ]
182
220
run_command (command , build_dir )
183
221
184
222
185
223
def run_inference ():
186
224
build_dir = get_llamacpp_build_dir ()
187
- out_path = os .path .join (FLAGS .model_dir , f"ggml-model.{ FLAGS .quant_type } .gguf" )
225
+ model_name = f"{ os .path .split (FLAGS .model_dir )[- 1 ]} .{ str (FLAGS .inference_type ).upper ()} .gguf"
226
+ out_path = os .path .join (FLAGS .model_dir , model_name )
188
227
if is_win ():
189
228
main_path = os .path .join (build_dir , "bin" , "Release" , "llama-cli.exe" )
190
229
if not os .path .exists (main_path ):
@@ -229,14 +268,67 @@ def run_inference():
229
268
'-m' , f'{ out_path } ' ,
230
269
'-n' , '128' ,
231
270
'-t' , f'{ FLAGS .num_threads } ' ,
232
- '-p' , prompt ,
271
+ '-p' , f' { prompt } ' ,
233
272
'-ngl' , '0' ,
234
273
'-c' , '2048'
235
274
]
236
275
log_file = run_command (command , build_dir )
237
276
print (GREEN + f"Check { log_file } for inference output" + RESET )
238
277
239
278
279
def run_llama_bench():
    """Benchmark the converted GGUF model with llama-bench.

    Locates the llama-bench binary for the current platform, then either runs
    it directly on the host or — when targeting Android — pushes the binaries,
    the model file (unless --skip_push_model is set) and kcfg.ini to the device
    over adb and runs the benchmark there. The log file path returned by the
    runner is reported to the user.
    """
    build_dir = get_llamacpp_build_dir()
    gguf_name = f"{os.path.split(FLAGS.model_dir)[-1]}.{str(FLAGS.inference_type).upper()}.gguf"
    out_path = os.path.join(FLAGS.model_dir, gguf_name)

    # Prefer the MSVC Release layout on Windows, falling back to the flat
    # bin/ layout used by other generators.
    main_path = os.path.join(build_dir, "bin", "llama-bench")
    if is_win():
        win_path = os.path.join(build_dir, "bin", "Release", "llama-bench.exe")
        if os.path.exists(win_path):
            main_path = win_path

    # For llama-bench, -p is a prompt *length* in tokens, not prompt text.
    prompt = 256

    # TODO: verify in Android
    if FLAGS.device == "android":
        # NOTE(review): os.path.join builds these device-side paths, which
        # would use backslashes on a Windows host — presumably only run from
        # POSIX hosts; confirm before relying on this on Windows.
        remote_bin_path = os.path.join(FLAGS.remote_dir, "bin")
        run_adb_command(['push', os.path.join(build_dir, "bin"), FLAGS.remote_dir], build_dir)

        remote_main_path = os.path.join(remote_bin_path, "llama-bench")
        run_adb_command(['shell', 'chmod', '-R', '+x', remote_bin_path], build_dir)

        remote_out_path = os.path.join(
            FLAGS.remote_dir,
            f"{os.path.basename(FLAGS.model_dir)}-{os.path.basename(out_path)}",
        )
        if not FLAGS.skip_push_model:
            run_adb_command(['push', out_path, remote_out_path], build_dir)

        # The kernel config must be on-device so the benchmark can load it
        # through TMAC_KCFG_FILE.
        kcfg_path = os.path.join(ROOT_DIR, "install", "lib", "kcfg.ini")
        remote_kcfg_path = os.path.join(FLAGS.remote_dir, "kcfg.ini")
        run_adb_command(['push', kcfg_path, remote_kcfg_path], build_dir)

        bench_cmd = [
            'shell',
            f'TMAC_KCFG_FILE={remote_kcfg_path}',
            f'{remote_main_path}',
            '-m', f'{remote_out_path}',
            '-n', '128',
            '-t', f'{FLAGS.num_threads}',
            '-p', f'{prompt}',
            '-ngl', '0',
        ]
        log_file = run_adb_command(bench_cmd, build_dir)
    else:
        bench_cmd = [
            f'{main_path}',
            '-m', f'{out_path}',
            '-n', '128',
            '-t', f'{FLAGS.num_threads}',
            '-p', f'{prompt}',
            '-ngl', '0',
        ]
        log_file = run_command(bench_cmd, build_dir)

    print(GREEN + f"Check {log_file} for llama-bench output" + RESET)
240
332
STEPS = [
241
333
("Compile kernels" , compile_kernels ),
242
334
("Build T-MAC C++ CMakeFiles" , cmake_t_mac ),
@@ -245,6 +337,7 @@ def run_inference():
245
337
("Build llama.cpp CMakeFiles" , cmake_llamacpp ),
246
338
("Build llama.cpp" , build_llamacpp ),
247
339
("Run inference" , run_inference ),
340
+ ("Run llama-bench" , run_llama_bench )
248
341
]
249
342
250
343
@@ -278,7 +371,10 @@ def parse_args():
278
371
parser .add_argument ("-gs" , "--group_size" , type = int , default = None , help = "Don't set this argument if you don't know its meaning." )
279
372
parser .add_argument ("-ags" , "--act_group_size" , type = int , default = None , help = "Don't set this argument if you don't know its meaning." )
280
373
parser .add_argument ("-ld" , "--logs_dir" , type = str , default = "logs" )
281
- parser .add_argument ("-q" , "--quant_type" , type = str , choices = ["int_n" , "f16" , "f32" ], default = "int_n" )
374
+ parser .add_argument ("-q" , "--quant_type" , type = str , choices = ["int_n" , "f16" , "f32" , "tq1_0" , "tq2_0" , "q4_0" ], default = None ,
375
+ help = "Quantization model type. This will override inference_type." )
376
+ parser .add_argument ("-it" , "--inference_type" , type = str , default = "int_n" ,
377
+ help = "Inference model type. This will be overridden by quant_type if quant_type is set." )
282
378
parser .add_argument ("-zp" , "--zero_point" , action = "store_true" , help = "Enforce enable zero_point. Required by EfficientQAT models." )
283
379
parser .add_argument ("-nzp" , "--no_zero_point" , action = "store_false" , help = "Enforce disable zero_point. Don't set this argument if you don't know its meaning." )
284
380
@@ -293,8 +389,16 @@ def parse_args():
293
389
parser .add_argument ("-ndk" , "--ndk_home" , type = str , default = "" , help = "NDK home" )
294
390
parser .add_argument ("-spm" , "--skip_push_model" , action = "store_true" , help = "Suppose the model is unchanged to skip pushing the model file" )
295
391
392
+ parser .add_argument ("-rc" , "--rechunk" , action = "store_true" , help = "Set this argument if you want to use rechunk in computation." )
393
+ parser .add_argument ("--disable-t-mac" , action = "store_true" , help = "Set this argument if you want to disable T-MAC." )
394
+
296
395
parser .set_defaults (zero_point = None )
297
- return parser .parse_args ()
396
+ args = parser .parse_args ()
397
+
398
+ if args .quant_type is not None :
399
+ args .inference_type = args .quant_type
400
+
401
+ return args
298
402
299
403
300
404
def get_quant_args ():
0 commit comments