
Commit e8f91f8

Author: Chris Maunder (committed)
Cleanup / better handling of non-accelerated env.
1 parent 0fc1ad3 commit e8f91f8

File tree: 2 files changed (+54, -35 lines)


multimode_llm.py (+20, -13)
@@ -4,12 +4,17 @@

 from PIL import Image

-use_ONNX = sys.platform != 'darwin'
-use_MLX  = sys.platform == 'darwin' and "ARM64" in platform.uname().version
+accel_mode = None
+if sys.platform == 'darwin':
+    if "ARM64" in platform.uname().version:
+        accel_mode = 'MLX'
+else:
+    accel_mode = 'ONNX'
+

-if use_ONNX:
+if accel_mode == 'ONNX':
     import onnxruntime_genai as og
-elif use_MLX:
+elif accel_mode == 'MLX':
     from phi_3_vision_mlx import generate, load
 else:
     from transformers import AutoModelForCausalLM, AutoProcessor, AutoConfig
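For reference, a minimal standalone sketch of how the new detection resolves on each platform. It simply mirrors the logic added above; the summary comments and the final print are illustrative additions, not part of the commit:

    import sys
    import platform

    # Mirrors the detection introduced in this commit:
    #   Windows / Linux        -> 'ONNX'
    #   macOS on Apple Silicon -> 'MLX'
    #   macOS on Intel         -> None (no acceleration; the transformers path is used)
    accel_mode = None
    if sys.platform == 'darwin':
        if "ARM64" in platform.uname().version:
            accel_mode = 'MLX'
    else:
        accel_mode = 'ONNX'

    print(f"Detected acceleration mode: {accel_mode or 'none (CPU only)'}")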
@@ -32,7 +37,7 @@ def __init__(self, model_repo: str, filename: str, model_dir: str,
         self.model_path = None

         try:
-            if use_ONNX: # Non macOS
+            if accel_mode == 'ONNX': # Non macOS

                 # For ONNX, we download the models at install time
                 self.device = device
@@ -41,7 +46,7 @@ def __init__(self, model_repo: str, filename: str, model_dir: str,
                 self.processor = self.model.create_multimodal_processor()
                 self.tokenizer_stream = self.processor.create_stream()

-            elif use_MLX: # macOS on Apple silicon.
+            elif accel_mode == 'MLX': # macOS, Apple Silicon.

                 # Hardcoded in MLX code
                 # repo = "microsoft/Phi-3-vision-128k-instruct"
@@ -50,16 +55,18 @@ def __init__(self, model_repo: str, filename: str, model_dir: str,
                 self.model_path = model_dir
                 self.model, self.processor = load(model_path=model_dir, adapter_path=None)

-            else: # macOS Intel
-
+            else: # macOS, Numpy, not MLX
                 # For macOS (intel), we don't download at install time (yet). We download at runtime
                 # TBD: Download model in installer, load the model here. If download
                 # and load fail, fall through to download-at-runtime
                 raise

         except Exception as ex:
-            if use_ONNX or use_MLX:
-                # No luck loading what we downloaded
+            # A general fall-through for the case where ONNX or MLX model loading failed, or where
+            # we only have non-GPU accelerated libraries (macOS on Intel) to use.
+
+            if accel_mode == 'ONNX' or accel_mode == 'MLX':
+                # We tried, but failed, and we won't fallback to CPU here (Could but won't).
                 self.model = None
                 self.processor = None
                 self.model_path = None
@@ -124,7 +131,7 @@ def do_chat(self, user_prompt: str, image: Image, system_prompt: str=None,

         inferenceMs = 0
         try:
-            if use_ONNX:
+            if accel_mode == 'ONNX':

                 # ONNX genai API doesn't (yet) provide the means to load an image
                 # from memory https://github.com/microsoft/onnxruntime-genai/issues/777
@@ -161,7 +168,7 @@ def do_chat(self, user_prompt: str, image: Image, system_prompt: str=None,

                 del generator

-            elif use_MLX:
+            elif accel_mode == 'MLX':

                 start_inference_time = time.perf_counter()

@@ -197,7 +204,7 @@ def do_chat(self, user_prompt: str, image: Image, system_prompt: str=None,
                 "inferenceMs": 0
             }

-        if not use_ONNX and self.device == "cuda":
+        if not accel_mode == 'ONNX' and self.device == "cuda":
            try:
                import torch
                torch.cuda.empty_cache()
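One note on the rewritten cleanup guard: in Python, "not" binds more loosely than "==", so "not accel_mode == 'ONNX'" evaluates as "not (accel_mode == 'ONNX')", which is the same as "accel_mode != 'ONNX'". A one-line check (the example value is arbitrary):

    accel_mode = 'MLX'  # arbitrary example value for illustration
    assert (not accel_mode == 'ONNX') == (accel_mode != 'ONNX')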

multimode_llm_adapter.py (+34, -22)
@@ -13,17 +13,17 @@
 # Import CodeProject.AI SDK
 from codeproject_ai_sdk import RequestData, ModuleRunner, ModuleOptions, LogMethod, LogVerbosity, JSON

-from multimode_llm import MultiModeLLM, use_ONNX, use_MLX
+from multimode_llm import MultiModeLLM, accel_mode

 class MultiModeLLM_adapter(ModuleRunner):

     def initialise(self) -> None:

-        if use_ONNX:
+        if accel_mode == 'ONNX':
             (cuda_major, cuda_minor) = self.system_info.getCudaVersion
             if cuda_major and (cuda_major >= 12 or (cuda_major == 11 and cuda_minor == 8)) :
                 self.inference_device  = "GPU"
-                self.inference_library = "CUDA"
+                self.inference_library = "ONNX/CUDA"
                 self.device            = "cuda"
                 self.model_repo        = "microsoft/Phi-3-vision-128k-instruct-onnx-cuda"
                 self.model_filename    = None # "Phi-3-vision-128k-instruct.gguf"
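For context on the version gate above, a minimal standalone sketch of the same selection between the CUDA and CPU ONNX model repos. The hard-coded version tuple is an assumption for illustration only; the adapter reads the real value from self.system_info.getCudaVersion:

    # Hypothetical CUDA version for illustration; the adapter obtains the real value from the SDK.
    cuda_major, cuda_minor = 11, 8

    if cuda_major and (cuda_major >= 12 or (cuda_major == 11 and cuda_minor == 8)):
        model_repo = "microsoft/Phi-3-vision-128k-instruct-onnx-cuda"  # ONNX with CUDA acceleration
    else:
        model_repo = "microsoft/Phi-3-vision-128k-instruct-onnx-cpu"   # ONNX on CPU
    print(f"Selected ONNX model repo: {model_repo}")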
@@ -36,13 +36,15 @@ def initialise(self) -> None:
             self.model_repo        = "microsoft/Phi-3-vision-128k-instruct-onnx-cpu"
             self.model_filename    = None # "Phi-3-vision-128k-instruct.gguf"
             self.models_dir        = "cpu-int4-rtn-block-32-acc-level-4"
-        elif use_MLX:
+
+        elif accel_mode == 'MLX': # macOS
             self.inference_device  = "GPU"
             self.inference_library = "MLX"
             self.device            = "mps"
             self.model_repo        = "microsoft/Phi-3.5-vision-instruct"
             self.model_filename    = None # "Phi-3.5-vision-instruct.gguf"
             self.models_dir        = "models"
+
         else:
             print("*** Multi-modal LLM using CPU only: This module requires > 16Gb RAM")
             # If only...
@@ -55,25 +57,32 @@ def initialise(self) -> None:
             self.model_repo        = "microsoft/Phi-3-vision-128k-instruct"
             self.model_filename    = None # "Phi-3-vision-128k-instruct.gguf"
             self.models_dir        = "./models"
-
-        verbose = self.log_verbosity != LogVerbosity.Quiet
-        self.multimode_chat = MultiModeLLM(model_repo=self.model_repo,
-                                           filename=self.model_filename,
-                                           model_dir=os.path.join(ModuleOptions.module_path,self.models_dir),
-                                           device=self.device,
-                                           inference_library=self.inference_library,
-                                           verbose=verbose)
-
-        if self.multimode_chat.model_path:
-            self.log(LogMethod.Info|LogMethod.Server, {
-                "message": f"Using model from '{self.multimode_chat.model_path}'",
-                "loglevel": "information"
-            })
-        else:
+
+
+        if self._performing_self_test and self.device == "cpu":
             self.log(LogMethod.Error|LogMethod.Server, {
-                "message": f"Unable to load Multi-mode model",
+                "message": f"Unable to perform self-text without acceleration",
                 "loglevel": "error"
             })
+        else:
+            verbose = self.log_verbosity != LogVerbosity.Quiet
+            self.multimode_chat = MultiModeLLM(model_repo=self.model_repo,
+                                               filename=self.model_filename,
+                                               model_dir=os.path.join(ModuleOptions.module_path,self.models_dir),
+                                               device=self.device,
+                                               inference_library=self.inference_library,
+                                               verbose=verbose)
+
+            if self.multimode_chat.model_path:
+                self.log(LogMethod.Info|LogMethod.Server, {
+                    "message": f"Using model from '{self.multimode_chat.model_path}'",
+                    "loglevel": "information"
+                })
+            else:
+                self.log(LogMethod.Error|LogMethod.Server, {
+                    "message": f"Unable to load Multi-mode model",
+                    "loglevel": "error"
+                })

         self.reply_text = ""
         self.cancelled  = False
@@ -113,7 +122,7 @@ def long_process(self, data: RequestData) -> JSON:
         error = None

         try:
-            if use_ONNX:
+            if accel_mode == 'ONNX':
                 (generator, tokenizer_stream) = self.multimode_chat.do_chat(user_prompt, image,
                                                                             system_prompt,
                                                                             max_tokens=max_tokens,
@@ -196,6 +205,9 @@ def cancel_command_task(self):

     def selftest(self) -> JSON:

+        if accel_mode == None:
+            return { "success": False, "message": "Not performing self-test on CPU due to time taken" }
+
         request_data = RequestData()
         request_data.queue   = self.queue_name
         request_data.command = "prompt"
@@ -213,7 +225,7 @@ def selftest(self) -> JSON:
         print(f"Info: Self-test for {self.module_id}. Success: {result['success']}")
         # print(f"Info: Self-test output for {self.module_id}: {result}")

-        return { "success": result['success'], "message": "MulitModal LLM test successful" }
+        return { "success": result['success'], "message": "MultiModal LLM test successful" }


 if __name__ == "__main__":
