
Commit e8f91f8

Author: Chris Maunder (committed)
Cleanup / better handling of non-accelerated env.
1 parent 0fc1ad3 commit e8f91f8

File tree: 2 files changed (+54, -35 lines)


multimode_llm.py (+20, -13)
@@ -4,12 +4,17 @@

 from PIL import Image

-use_ONNX = sys.platform != 'darwin'
-use_MLX  = sys.platform == 'darwin' and "ARM64" in platform.uname().version
+accel_mode = None
+if sys.platform == 'darwin':
+    if "ARM64" in platform.uname().version:
+        accel_mode = 'MLX'
+else:
+    accel_mode = 'ONNX'
+

-if use_ONNX:
+if accel_mode == 'ONNX':
     import onnxruntime_genai as og
-elif use_MLX:
+elif accel_mode == 'MLX':
     from phi_3_vision_mlx import generate, load
 else:
     from transformers import AutoModelForCausalLM, AutoProcessor, AutoConfig
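For reference, a minimal standalone sketch of how the new detection resolves on each platform. It simply mirrors the logic added above; the summary comments and the final print are illustrative additions, not part of the commit:

    import sys
    import platform

    # Mirrors the detection introduced in this commit:
    #   Windows / Linux        -> 'ONNX'
    #   macOS on Apple Silicon -> 'MLX'
    #   macOS on Intel         -> None (no acceleration; the transformers path is used)
    accel_mode = None
    if sys.platform == 'darwin':
        if "ARM64" in platform.uname().version:
            accel_mode = 'MLX'
    else:
        accel_mode = 'ONNX'

    print(f"Detected acceleration mode: {accel_mode or 'none (CPU only)'}")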
@@ -32,7 +37,7 @@ def __init__(self, model_repo: str, filename: str, model_dir: str,
         self.model_path = None

         try:
-            if use_ONNX: # Non macOS
+            if accel_mode == 'ONNX': # Non macOS

                 # For ONNX, we download the models at install time
                 self.device = device
@@ -41,7 +46,7 @@ def __init__(self, model_repo: str, filename: str, model_dir: str,
                 self.processor = self.model.create_multimodal_processor()
                 self.tokenizer_stream = self.processor.create_stream()

-            elif use_MLX: # macOS on Apple silicon.
+            elif accel_mode == 'MLX': # macOS, Apple Silicon.

                 # Hardcoded in MLX code
                 # repo = "microsoft/Phi-3-vision-128k-instruct"
@@ -50,16 +55,18 @@ def __init__(self, model_repo: str, filename: str, model_dir: str,
                 self.model_path = model_dir
                 self.model, self.processor = load(model_path=model_dir, adapter_path=None)

-            else: # macOS Intel
-
+            else: # macOS, Numpy, not MLX
                 # For macOS (intel), we don't download at install time (yet). We download at runtime
                 # TBD: Download model in installer, load the model here. If download
                 # and load fail, fall through to download-at-runtime
                 raise

         except Exception as ex:
-            if use_ONNX or use_MLX:
-                # No luck loading what we downloaded
+            # A general fall-through for the case where ONNX or MLX model loading failed, or where
+            # we only have non-GPU accelerated libraries (macOS on Intel) to use.
+
+            if accel_mode == 'ONNX' or accel_mode == 'MLX':
+                # We tried, but failed, and we won't fallback to CPU here (Could but won't).
                 self.model = None
                 self.processor = None
                 self.model_path = None
@@ -124,7 +131,7 @@ def do_chat(self, user_prompt: str, image: Image, system_prompt: str=None,

         inferenceMs = 0
         try:
-            if use_ONNX:
+            if accel_mode == 'ONNX':

                 # ONNX genai API doesn't (yet) provide the means to load an image
                 # from memory https://github.com/microsoft/onnxruntime-genai/issues/777
@@ -161,7 +168,7 @@ def do_chat(self, user_prompt: str, image: Image, system_prompt: str=None,

                 del generator

-            elif use_MLX:
+            elif accel_mode == 'MLX':

                 start_inference_time = time.perf_counter()

@@ -197,7 +204,7 @@ def do_chat(self, user_prompt: str, image: Image, system_prompt: str=None,
                 "inferenceMs": 0
             }

-        if not use_ONNX and self.device == "cuda":
+        if not accel_mode == 'ONNX' and self.device == "cuda":
            try:
                import torch
                torch.cuda.empty_cache()
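One note on the rewritten cleanup guard: in Python, "not" binds more loosely than "==", so "not accel_mode == 'ONNX'" evaluates as "not (accel_mode == 'ONNX')", which is the same as "accel_mode != 'ONNX'". A one-line check (the example value is arbitrary):

    accel_mode = 'MLX'  # arbitrary example value for illustration
    assert (not accel_mode == 'ONNX') == (accel_mode != 'ONNX')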

multimode_llm_adapter.py (+34, -22)
@@ -13,17 +13,17 @@
 # Import CodeProject.AI SDK
 from codeproject_ai_sdk import RequestData, ModuleRunner, ModuleOptions, LogMethod, LogVerbosity, JSON

-from multimode_llm import MultiModeLLM, use_ONNX, use_MLX
+from multimode_llm import MultiModeLLM, accel_mode

 class MultiModeLLM_adapter(ModuleRunner):

     def initialise(self) -> None:

-        if use_ONNX:
+        if accel_mode == 'ONNX':
             (cuda_major, cuda_minor) = self.system_info.getCudaVersion
             if cuda_major and (cuda_major >= 12 or (cuda_major == 11 and cuda_minor == 8)) :
                 self.inference_device  = "GPU"
-                self.inference_library = "CUDA"
+                self.inference_library = "ONNX/CUDA"
                 self.device            = "cuda"
                 self.model_repo        = "microsoft/Phi-3-vision-128k-instruct-onnx-cuda"
                 self.model_filename    = None # "Phi-3-vision-128k-instruct.gguf"
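For context on the version gate above, a minimal standalone sketch of the same selection between the CUDA and CPU ONNX model repos. The hard-coded version tuple is an assumption for illustration only; the adapter reads the real value from self.system_info.getCudaVersion:

    # Hypothetical CUDA version for illustration; the adapter obtains the real value from the SDK.
    cuda_major, cuda_minor = 11, 8

    if cuda_major and (cuda_major >= 12 or (cuda_major == 11 and cuda_minor == 8)):
        model_repo = "microsoft/Phi-3-vision-128k-instruct-onnx-cuda"  # ONNX with CUDA acceleration
    else:
        model_repo = "microsoft/Phi-3-vision-128k-instruct-onnx-cpu"   # ONNX on CPU
    print(f"Selected ONNX model repo: {model_repo}")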
@@ -36,13 +36,15 @@ def initialise(self) -> None:
             self.model_repo        = "microsoft/Phi-3-vision-128k-instruct-onnx-cpu"
             self.model_filename    = None # "Phi-3-vision-128k-instruct.gguf"
             self.models_dir        = "cpu-int4-rtn-block-32-acc-level-4"
-        elif use_MLX:
+
+        elif accel_mode == 'MLX': # macOS
             self.inference_device  = "GPU"
             self.inference_library = "MLX"
             self.device            = "mps"
             self.model_repo        = "microsoft/Phi-3.5-vision-instruct"
             self.model_filename    = None # "Phi-3.5-vision-instruct.gguf"
             self.models_dir        = "models"
+
         else:
             print("*** Multi-modal LLM using CPU only: This module requires > 16Gb RAM")
             # If only...
@@ -55,25 +57,32 @@ def initialise(self) -> None:
             self.model_repo        = "microsoft/Phi-3-vision-128k-instruct"
             self.model_filename    = None # "Phi-3-vision-128k-instruct.gguf"
             self.models_dir        = "./models"
-
-        verbose = self.log_verbosity != LogVerbosity.Quiet
-        self.multimode_chat = MultiModeLLM(model_repo=self.model_repo,
-                                           filename=self.model_filename,
-                                           model_dir=os.path.join(ModuleOptions.module_path,self.models_dir),
-                                           device=self.device,
-                                           inference_library=self.inference_library,
-                                           verbose=verbose)
-
-        if self.multimode_chat.model_path:
-            self.log(LogMethod.Info|LogMethod.Server, {
-                "message": f"Using model from '{self.multimode_chat.model_path}'",
-                "loglevel": "information"
-            })
-        else:
+
+
+        if self._performing_self_test and self.device == "cpu":
             self.log(LogMethod.Error|LogMethod.Server, {
-                "message": f"Unable to load Multi-mode model",
+                "message": f"Unable to perform self-text without acceleration",
                 "loglevel": "error"
             })
+        else:
+            verbose = self.log_verbosity != LogVerbosity.Quiet
+            self.multimode_chat = MultiModeLLM(model_repo=self.model_repo,
+                                               filename=self.model_filename,
+                                               model_dir=os.path.join(ModuleOptions.module_path,self.models_dir),
+                                               device=self.device,
+                                               inference_library=self.inference_library,
+                                               verbose=verbose)
+
+            if self.multimode_chat.model_path:
+                self.log(LogMethod.Info|LogMethod.Server, {
+                    "message": f"Using model from '{self.multimode_chat.model_path}'",
+                    "loglevel": "information"
+                })
+            else:
+                self.log(LogMethod.Error|LogMethod.Server, {
+                    "message": f"Unable to load Multi-mode model",
+                    "loglevel": "error"
+                })

         self.reply_text = ""
         self.cancelled  = False
@@ -113,7 +122,7 @@ def long_process(self, data: RequestData) -> JSON:
         error = None

         try:
-            if use_ONNX:
+            if accel_mode == 'ONNX':
                 (generator, tokenizer_stream) = self.multimode_chat.do_chat(user_prompt, image,
                                                                             system_prompt,
                                                                             max_tokens=max_tokens,
@@ -196,6 +205,9 @@ def cancel_command_task(self):

     def selftest(self) -> JSON:

+        if accel_mode == None:
+            return { "success": False, "message": "Not performing self-test on CPU due to time taken" }
+
         request_data = RequestData()
         request_data.queue   = self.queue_name
         request_data.command = "prompt"
@@ -213,7 +225,7 @@ def selftest(self) -> JSON:
         print(f"Info: Self-test for {self.module_id}. Success: {result['success']}")
         # print(f"Info: Self-test output for {self.module_id}: {result}")

-        return { "success": result['success'], "message": "MulitModal LLM test successful" }
+        return { "success": result['success'], "message": "MultiModal LLM test successful" }


 if __name__ == "__main__":
