Commit fafab33

Author: Chris Maunder (committed)
Working on macOS arm64 via MLX

1 parent b1b4186

File tree: 7 files changed, +129 -45 lines


.vscode/launch.json

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@
             "python": "${workspaceFolder}/bin/linux/python310/venv/bin/python",
         },
         "osx": {
-            "python": "${workspaceFolder}/bin/macos/python310/venv/bin/python",
+            "python": "${workspaceFolder}/bin/macos/python311/venv/bin/python",
         }
     }
 ]

install.sh

Lines changed: 3 additions & 0 deletions
@@ -29,6 +29,9 @@ if [ "$moduleInstallErrors" = "" ]; then
     oneStepPIP=true          # Makes dealing with Numpy so much easier.

     if [ "$os" = "macos" ]; then
+
+        oneStepPIP=false     # Makes dealing with Numpy so much easier.
+
         phi3_sourceUrl="..."
         phi3_fileToGet="..."
         # brew install git-lfs

modulesettings.json

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@
    },

    "InstallOptions" : {
-      "Platforms": [ "windows", "Linux", "macOS" ],
+      "Platforms": [ "windows", "Linux", "macOS", "macOS-arm64" ],
      "ModuleReleases": [ // Which server version is compatible with each version of this module.
         { "ModuleVersion": "1.0.0", "ServerVersionRange": [ "2.8.0", "" ], "ReleaseDate": "2024-08-04", "ReleaseNotes": "Initial release" }
      ]

modulesettings.macos.arm64.json

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+{
+  "Modules": {
+    "MultiModeLLM": {
+      "LaunchSettings": {
+        "Runtime": "python3.11"
+      }
+    }
+  }
+}
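
This new file pins the module to python3.11 on Apple silicon. A minimal sketch of how such a platform-specific overlay can be resolved against the base modulesettings.json above, assuming the server deep-merges the platform file over the base; the loader below is illustrative only (the real files also contain //-style comments, which plain json.load would reject):

import json

def deep_merge(base: dict, overlay: dict) -> dict:
    """Recursively merge overlay into base; overlay values win on conflicts."""
    merged = dict(base)
    for key, value in overlay.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

with open("modulesettings.json") as f:               # base settings (assumed comment-free here)
    base = json.load(f)
with open("modulesettings.macos.arm64.json") as f:   # platform overlay from this commit
    overlay = json.load(f)

settings = deep_merge(base, overlay)

# On macOS arm64 the effective runtime is the overridden value:
print(settings["Modules"]["MultiModeLLM"]["LaunchSettings"]["Runtime"])   # python3.11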

multimode_llm.py

Lines changed: 42 additions & 6 deletions
@@ -1,13 +1,16 @@
+import platform
 import time
 import sys

 from PIL import Image

-# ONNX isn't supported in macOS
-use_ONNX = sys.platform != 'darwin'
+use_ONNX = sys.platform != 'darwin'
+use_MLX  = sys.platform == 'darwin' and "ARM64" in platform.uname().version

 if use_ONNX:
     import onnxruntime_genai as og
+elif use_MLX:
+    from phi_3_vision_mlx import generate, load
 else:
     from transformers import AutoModelForCausalLM, AutoProcessor, AutoConfig

@@ -29,21 +32,33 @@ def __init__(self, model_repo: str, filename: str, model_dir: str,
         self.model_path = None

         try:
-            if use_ONNX:
+            if use_ONNX:    # Non macOS
+
                 # For ONNX, we download the models at install time
                 self.device = device
                 self.model_path = model_dir
                 self.model = og.Model(self.model_path)
                 self.processor = self.model.create_multimodal_processor()
                 self.tokenizer_stream = self.processor.create_stream()
-            else:
-                # For macOS, we don't download at install time (yet). We download at runtime
+
+            elif use_MLX:   # macOS on Apple silicon.
+
+                # Hardcoded in MLX code
+                # repo = "microsoft/Phi-3-vision-128k-instruct"
+
+                self.device = device
+                self.model_path = model_dir
+                self.model, self.processor = load(model_path=model_dir, adapter_path=None)
+
+            else:           # macOS Intel
+
+                # For macOS (intel), we don't download at install time (yet). We download at runtime
                 # TBD: Download model in installer, load the model here. If download
                 # and load fail, fall through to download-at-runtime
                 raise

         except Exception as ex:
-            if use_ONNX:
+            if use_ONNX or use_MLX:
                 # No luck loading what we downloaded
                 self.model = None
                 self.processor = None

@@ -146,6 +161,27 @@ def do_chat(self, user_prompt: str, image: Image, system_prompt: str=None,

             del generator

+        elif use_MLX:
+
+            # Using phi_3_vision_mlx v0.0.2
+            # https://github.com/JosefAlbers/Phi-3-Vision-MLX/tree/v0.0.2-beta
+            start_inference_time = time.perf_counter()
+            response = generate(self.model, self.processor, prompt, [image])
+            inferenceMs = int((time.perf_counter() - start_inference_time) * 1000)
+
+            # Using latest phi_3_vision_mlx
+            # import os
+            # temp_name = "onnx_genai_temp_image.png"
+            # image.save(temp_name)
+            #
+            # agent = Agent()
+            # start_inference_time = time.perf_counter()
+            # response = agent(prompt, images=[temp_name])
+            # inferenceMs = int((time.perf_counter() - start_inference_time) * 1000)
+            # agent.end()
+            #
+            # os.remove(temp_name)
+
         else:
             inputs = self.processor(prompt, image, return_tensors="pt").to(self.device)
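
A minimal standalone sketch of the MLX code path added above, assuming phi_3_vision_mlx v0.0.2 and a locally downloaded model. The "models" directory and "test.jpg" filename are illustrative, not from this commit, and the real module builds its prompt via the Phi-3 chat template rather than the plain string used here:

import platform
import sys
import time

from PIL import Image

use_MLX = sys.platform == 'darwin' and "ARM64" in platform.uname().version

if use_MLX:
    from phi_3_vision_mlx import generate, load

    # Load the model and processor from a local directory (hypothetical path).
    model, processor = load(model_path="models", adapter_path=None)

    image  = Image.open("test.jpg")      # any RGB test image
    prompt = "Describe this image."

    start = time.perf_counter()
    response = generate(model, processor, prompt, [image])
    print(f"{response}  ({int((time.perf_counter() - start) * 1000)}ms)")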

multimode_llm_adapter.py

Lines changed: 67 additions & 35 deletions
@@ -13,7 +13,7 @@
 # Import CodeProject.AI SDK
 from codeproject_ai_sdk import RequestData, ModuleRunner, ModuleOptions, LogMethod, LogVerbosity, JSON

-from multimode_llm import MultiModeLLM, use_ONNX
+from multimode_llm import MultiModeLLM, use_ONNX, use_MLX

 class MultiModeLLM_adapter(ModuleRunner):

@@ -36,6 +36,13 @@ def initialise(self) -> None:
             self.model_repo = "microsoft/Phi-3-vision-128k-instruct-onnx-cpu"
             self.model_filename = None # "Phi-3-vision-128k-instruct.gguf"
             self.models_dir = "cpu-int4-rtn-block-32-acc-level-4"
+        elif use_MLX:
+            self.inference_device  = "GPU"
+            self.inference_library = "MLX"
+            self.device = "mps"
+            self.model_repo = "microsoft/Phi-3.5-vision-instruct"
+            self.model_filename = None # "Phi-3.5-vision-instruct.gguf"
+            self.models_dir = "models"
         else:
             print("*** Multi-modal LLM using CPU only: This module requires > 16Gb RAM")
             # If only...

@@ -101,47 +108,72 @@ def long_process(self, data: RequestData) -> JSON:
         # # pix = page.get_pixmap(matrix=mat) # use 'mat' instead of the identity matrix

         start_process_time = time.perf_counter()
+        start_inference_time = time.perf_counter()

-        try:
-            (generator, tokenizer_stream) = self.multimode_chat.do_chat(user_prompt, image,
-                                                                        system_prompt,
-                                                                        max_tokens=max_tokens,
-                                                                        temperature=temperature,
-                                                                        stream=True)
-
-            start_inference_time = time.perf_counter()
-
-            if generator:
-                while not generator.is_done():
-                    if self.cancelled:
-                        self.cancelled = False
-                        stop_reason = "cancelled"
-                        break
+        error = None

-                    generator.compute_logits()
-                    generator.generate_next_token()
-
-                    next_tokens = generator.get_next_tokens()
-                    next_token = next_tokens[0]
-                    next_response = tokenizer_stream.decode(next_token)
+        try:
+            if use_ONNX:
+                (generator, tokenizer_stream) = self.multimode_chat.do_chat(user_prompt, image,
+                                                                            system_prompt,
+                                                                            max_tokens=max_tokens,
+                                                                            temperature=temperature,
+                                                                            stream=True)
+                if generator:
+                    while not generator.is_done():
+                        if self.cancelled:
+                            self.cancelled = False
+                            stop_reason = "cancelled"
+                            break
+
+                        generator.compute_logits()
+                        generator.generate_next_token()
+
+                        next_tokens = generator.get_next_tokens()
+                        next_token = next_tokens[0]
+                        next_response = tokenizer_stream.decode(next_token)
+
+                        self.reply_text += next_response
+
+                inferenceMs : int = int((time.perf_counter() - start_inference_time) * 1000)
+
+                if generator:
+                    del generator

-                    self.reply_text += next_response
+            else:
+                llm_response = self.multimode_chat.do_chat(user_prompt, image, system_prompt,
+                                                           max_tokens=max_tokens,
+                                                           temperature=temperature,
+                                                           stream=False)
+                if llm_response["success"]:
+                    inferenceMs = llm_response["inferenceMs"]
+                    self.reply_text = llm_response["reply"]
+                else:
+                    error = llm_response["error"] if "error" in llm_response["error"] else "Error generating reply"
+                    inferenceMs = 0
+
+            if stop_reason == "cancelled" and not self.reply_text and not error:
+                error = "Operation cancelled"

-            inferenceMs : int = int((time.perf_counter() - start_inference_time) * 1000)
-
-            if generator:
-                del generator
-
             if stop_reason is None:
                 stop_reason = "completed"

-            response = {
-                "success": True,
-                "reply": self.reply_text,
-                "stop_reason": stop_reason,
-                "processMs": int((time.perf_counter() - start_process_time) * 1000),
-                "inferenceMs" : inferenceMs
-            }
+            if error:
+                response = {
+                    "success": False,
+                    "error": error,
+                    "reply": "",
+                    "stop_reason": stop_reason,
+                    "processMs": int((time.perf_counter() - start_process_time) * 1000),
+                }
+            else:
+                response = {
+                    "success": True,
+                    "reply": self.reply_text,
+                    "stop_reason": stop_reason,
+                    "processMs": int((time.perf_counter() - start_process_time) * 1000),
+                    "inferenceMs" : inferenceMs
+                }

         except Exception as ex:
             self.report_error(ex, __file__)
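
The new non-streaming branch expects do_chat(stream=False) to return a dict with "success", "reply", "inferenceMs" and optionally "error". A minimal sketch of that mapping with a stubbed result (handle_llm_response is a hypothetical helper, not part of the module); note the error guard here checks the response dict itself for an "error" key, which is the intent of the branch in the commit:

import time

def handle_llm_response(llm_response: dict, start_process_time: float) -> dict:
    """Map a non-streaming do_chat() result onto the adapter's response payload."""
    if llm_response.get("success"):
        return {
            "success":     True,
            "reply":       llm_response["reply"],
            "stop_reason": "completed",
            "processMs":   int((time.perf_counter() - start_process_time) * 1000),
            "inferenceMs": llm_response["inferenceMs"],
        }

    return {
        "success":     False,
        "error":       llm_response.get("error", "Error generating reply"),
        "reply":       "",
        "stop_reason": "completed",
        "processMs":   int((time.perf_counter() - start_process_time) * 1000),
    }

# Example with a stubbed chat result:
start = time.perf_counter()
print(handle_llm_response({"success": True, "reply": "A cat.", "inferenceMs": 850}, start))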

requirements.macos.arm64.txt

Lines changed: 6 additions & 2 deletions
@@ -1,7 +1,11 @@
-#! Python3.10
+#! Python3.12

 # For Phi-3 for Apple Silicon using MLX
-phi-3-vision-mlx           # Installing onnxruntime-genai, the ONNX Runtime generate() API
+#See https://huggingface.co/JosefAlbers/Phi-3-vision-128k-instruct-mlx
+mlx                        # Installing MLX, a framework for machine learning on Apple silicon.
+phi-3-vision-mlx==0.0.2    # Installing onnxruntime-genai, the ONNX Runtime generate() API
+torch                      # Installing PyTorch, an open source machine learning framework
+torchvision                # Installing TorchVision, for working with computer vision models

 CodeProject-AI-SDK         # Installing the CodeProject.AI SDK
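
A quick sanity check, not part of the commit, that the packages listed in requirements.macos.arm64.txt resolved into the module's venv on Apple silicon:

import importlib.metadata as md

# Distribution names as they appear in the requirements file.
for pkg in ("mlx", "phi-3-vision-mlx", "torch", "torchvision", "codeproject-ai-sdk"):
    try:
        print(f"{pkg}: {md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg}: not installed")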
