# Import CodeProject.AI SDK
from codeproject_ai_sdk import RequestData, ModuleRunner, ModuleOptions, LogMethod, LogVerbosity, JSON

- from multimode_llm import MultiModeLLM, use_ONNX, use_MLX
+ from multimode_llm import MultiModeLLM, accel_mode

class MultiModeLLM_adapter(ModuleRunner):

    def initialise(self) -> None:

-       if use_ONNX:
+       if accel_mode == 'ONNX':
            (cuda_major, cuda_minor) = self.system_info.getCudaVersion
            if cuda_major and (cuda_major >= 12 or (cuda_major == 11 and cuda_minor == 8)):
                self.inference_device  = "GPU"
-               self.inference_library = "CUDA"
+               self.inference_library = "ONNX/CUDA"
                self.device            = "cuda"
                self.model_repo        = "microsoft/Phi-3-vision-128k-instruct-onnx-cuda"
                self.model_filename    = None # "Phi-3-vision-128k-instruct.gguf"
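The hunk above replaces the separate `use_ONNX` / `use_MLX` booleans with a single `accel_mode` string (compared against `'ONNX'`, `'MLX'`, and `None` elsewhere in the diff) and relabels the CUDA path's library as "ONNX/CUDA". The commit never shows how `multimode_llm.py` derives `accel_mode`; a minimal sketch of the kind of probe it implies is below. The `detect_accel_mode` helper and the `mlx_vlm` / `onnxruntime_genai` package checks are assumptions for illustration, not the module's actual code.

```python
# Illustrative sketch only: the diff compares accel_mode against 'ONNX', 'MLX'
# and None, but does not show how multimode_llm.py computes it. The helper name
# and the package probes below are assumptions.
import platform
from importlib.util import find_spec
from typing import Optional

def detect_accel_mode() -> Optional[str]:
    """Return 'ONNX', 'MLX', or None for the plain-CPU fallback."""
    if (platform.system() == "Darwin" and platform.machine() == "arm64"
            and find_spec("mlx_vlm") is not None):       # Apple Silicon with MLX installed
        return "MLX"
    if find_spec("onnxruntime_genai") is not None:        # ONNX Runtime GenAI installed
        return "ONNX"
    return None                                           # no acceleration: CPU-only path

accel_mode = detect_accel_mode()
```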
@@ -36,13 +36,15 @@ def initialise(self) -> None:
                self.model_repo     = "microsoft/Phi-3-vision-128k-instruct-onnx-cpu"
                self.model_filename = None # "Phi-3-vision-128k-instruct.gguf"
                self.models_dir     = "cpu-int4-rtn-block-32-acc-level-4"
-       elif use_MLX:
+
+       elif accel_mode == 'MLX':   # macOS
            self.inference_device  = "GPU"
            self.inference_library = "MLX"
            self.device            = "mps"
            self.model_repo        = "microsoft/Phi-3.5-vision-instruct"
            self.model_filename    = None # "Phi-3.5-vision-instruct.gguf"
            self.models_dir        = "models"
+
        else:
            print("*** Multi-modal LLM using CPU only: This module requires > 16Gb RAM")
            # If only...
@@ -55,25 +57,32 @@ def initialise(self) -> None:
            self.model_repo     = "microsoft/Phi-3-vision-128k-instruct"
            self.model_filename = None # "Phi-3-vision-128k-instruct.gguf"
            self.models_dir     = "./models"
-
-       verbose = self.log_verbosity != LogVerbosity.Quiet
-       self.multimode_chat = MultiModeLLM(model_repo=self.model_repo,
-                                          filename=self.model_filename,
-                                          model_dir=os.path.join(ModuleOptions.module_path, self.models_dir),
-                                          device=self.device,
-                                          inference_library=self.inference_library,
-                                          verbose=verbose)
-
-       if self.multimode_chat.model_path:
-           self.log(LogMethod.Info | LogMethod.Server, {
-               "message": f"Using model from '{self.multimode_chat.model_path}'",
-               "loglevel": "information"
-           })
-       else:
+
+
+       if self._performing_self_test and self.device == "cpu":
            self.log(LogMethod.Error | LogMethod.Server, {
-               "message": f"Unable to load Multi-mode model",
+               "message": f"Unable to perform self-test without acceleration",
                "loglevel": "error"
            })
+       else:
+           verbose = self.log_verbosity != LogVerbosity.Quiet
+           self.multimode_chat = MultiModeLLM(model_repo=self.model_repo,
+                                              filename=self.model_filename,
+                                              model_dir=os.path.join(ModuleOptions.module_path, self.models_dir),
+                                              device=self.device,
+                                              inference_library=self.inference_library,
+                                              verbose=verbose)
+
+           if self.multimode_chat.model_path:
+               self.log(LogMethod.Info | LogMethod.Server, {
+                   "message": f"Using model from '{self.multimode_chat.model_path}'",
+                   "loglevel": "information"
+               })
+           else:
+               self.log(LogMethod.Error | LogMethod.Server, {
+                   "message": f"Unable to load Multi-mode model",
+                   "loglevel": "error"
+               })

        self.reply_text = ""
        self.cancelled  = False
@@ -113,7 +122,7 @@ def long_process(self, data: RequestData) -> JSON:
        error = None

        try:
-           if use_ONNX:
+           if accel_mode == 'ONNX':
                (generator, tokenizer_stream) = self.multimode_chat.do_chat(user_prompt, image,
                                                                            system_prompt,
                                                                            max_tokens=max_tokens,
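In the ONNX path `do_chat` hands back a `(generator, tokenizer_stream)` pair rather than a finished string, so `long_process` can stream the reply token by token; the decode loop itself falls outside this hunk. A hedged sketch of the usual onnxruntime-genai consumption pattern is below; the method names follow that library's streaming examples and may differ between releases, and this helper mirrors the pattern rather than reproducing the module's exact loop.

```python
def stream_reply(generator, tokenizer_stream, on_partial=None) -> str:
    """Drain an onnxruntime-genai style generator into a string, one token at a time.

    Sketch only: mirrors the common streaming pattern, not the module's exact code.
    """
    reply = ""
    while not generator.is_done():
        generator.generate_next_token()              # advance decoding by one step
        token = generator.get_next_tokens()[0]       # id of the newly sampled token
        reply += tokenizer_stream.decode(token)      # incremental detokenisation
        if on_partial:
            on_partial(reply)                        # e.g. update self.reply_text for polling clients
    return reply
```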
@@ -196,6 +205,9 @@ def cancel_command_task(self):
    def selftest(self) -> JSON:

+       if accel_mode == None:
+           return { "success": False, "message": "Not performing self-test on CPU due to time taken" }
+
        request_data = RequestData()
        request_data.queue   = self.queue_name
        request_data.command = "prompt"
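The lines elided between this hunk and the next fill in the test payload and run it through the module's normal 'prompt' handling. Those values are not part of this commit; a plausible continuation using the SDK's `RequestData.add_value` helper is sketched below, with the prompt text and parameter names purely illustrative.

```python
# Illustrative only: the actual self-test payload is not shown in this diff.
request_data.add_value("prompt", "Describe this image.")   # hypothetical test prompt
request_data.add_value("max_tokens", 1024)                 # hypothetical reply-length cap
# A test image would also be attached here, after which the request is run
# through the module's prompt pipeline and the result checked below.
```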
@@ -213,7 +225,7 @@ def selftest(self) -> JSON:
        print(f"Info: Self-test for {self.module_id}. Success: {result['success']}")
        # print(f"Info: Self-test output for {self.module_id}: {result}")

-       return { "success": result['success'], "message": "MulitModal LLM test successful" }
+       return { "success": result['success'], "message": "MultiModal LLM test successful" }


if __name__ == "__main__":
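The view cuts off at the entry-point guard. In CodeProject.AI Python modules this guard conventionally just instantiates the adapter and starts its request loop; a sketch of that closing line, following the SDK's ModuleRunner pattern (not shown in this commit), is:

```python
if __name__ == "__main__":
    # Standard CodeProject.AI module entry point: create the adapter and
    # start polling the server's queue for work.
    MultiModeLLM_adapter().start_loop()
```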