Commit fafab33

Author: Chris Maunder (committed)
Working on macOS arm64 via MLX

1 parent b1b4186

File tree: 7 files changed, +129 -45 lines


.vscode/launch.json

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@
             "python": "${workspaceFolder}/bin/linux/python310/venv/bin/python",
         },
         "osx": {
-            "python": "${workspaceFolder}/bin/macos/python310/venv/bin/python",
+            "python": "${workspaceFolder}/bin/macos/python311/venv/bin/python",
         }
     }
 ]

install.sh

Lines changed: 3 additions & 0 deletions
@@ -29,6 +29,9 @@ if [ "$moduleInstallErrors" = "" ]; then
     oneStepPIP=true          # Makes dealing with Numpy so much easier.

     if [ "$os" = "macos" ]; then
+
+        oneStepPIP=false     # Makes dealing with Numpy so much easier.
+
         phi3_sourceUrl="..."
         phi3_fileToGet="..."
         # brew install git-lfs

modulesettings.json

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@
    },

    "InstallOptions" : {
-      "Platforms": [ "windows", "Linux", "macOS" ],
+      "Platforms": [ "windows", "Linux", "macOS", "macOS-arm64" ],
      "ModuleReleases": [ // Which server version is compatible with each version of this module.
         { "ModuleVersion": "1.0.0", "ServerVersionRange": [ "2.8.0", "" ], "ReleaseDate": "2024-08-04", "ReleaseNotes": "Initial release" }
      ]

modulesettings.macos.arm64.json

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+{
+  "Modules": {
+    "MultiModeLLM": {
+      "LaunchSettings": {
+        "Runtime": "python3.11"
+      }
+    }
+  }
+}
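
This new file pins the module to python3.11 on Apple silicon. A minimal sketch of how such a platform-specific overlay can be resolved against the base modulesettings.json above, assuming the server deep-merges the platform file over the base; the loader below is illustrative only (the real files also contain //-style comments, which plain json.load would reject):

import json

def deep_merge(base: dict, overlay: dict) -> dict:
    """Recursively merge overlay into base; overlay values win on conflicts."""
    merged = dict(base)
    for key, value in overlay.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

with open("modulesettings.json") as f:               # base settings (assumed comment-free here)
    base = json.load(f)
with open("modulesettings.macos.arm64.json") as f:   # platform overlay from this commit
    overlay = json.load(f)

settings = deep_merge(base, overlay)

# On macOS arm64 the effective runtime is the overridden value:
print(settings["Modules"]["MultiModeLLM"]["LaunchSettings"]["Runtime"])   # python3.11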

multimode_llm.py

Lines changed: 42 additions & 6 deletions
@@ -1,13 +1,16 @@
+import platform
 import time
 import sys

 from PIL import Image

-# ONNX isn't supported in macOS
-use_ONNX = sys.platform != 'darwin'
+use_ONNX = sys.platform != 'darwin'
+use_MLX  = sys.platform == 'darwin' and "ARM64" in platform.uname().version

 if use_ONNX:
     import onnxruntime_genai as og
+elif use_MLX:
+    from phi_3_vision_mlx import generate, load
 else:
     from transformers import AutoModelForCausalLM, AutoProcessor, AutoConfig

@@ -29,21 +32,33 @@ def __init__(self, model_repo: str, filename: str, model_dir: str,
         self.model_path = None

         try:
-            if use_ONNX:
+            if use_ONNX:    # Non macOS
+
                 # For ONNX, we download the models at install time
                 self.device = device
                 self.model_path = model_dir
                 self.model = og.Model(self.model_path)
                 self.processor = self.model.create_multimodal_processor()
                 self.tokenizer_stream = self.processor.create_stream()
-            else:
-                # For macOS, we don't download at install time (yet). We download at runtime
+
+            elif use_MLX:   # macOS on Apple silicon.
+
+                # Hardcoded in MLX code
+                # repo = "microsoft/Phi-3-vision-128k-instruct"
+
+                self.device = device
+                self.model_path = model_dir
+                self.model, self.processor = load(model_path=model_dir, adapter_path=None)
+
+            else:           # macOS Intel
+
+                # For macOS (intel), we don't download at install time (yet). We download at runtime
                 # TBD: Download model in installer, load the model here. If download
                 # and load fail, fall through to download-at-runtime
                 raise

         except Exception as ex:
-            if use_ONNX:
+            if use_ONNX or use_MLX:
                 # No luck loading what we downloaded
                 self.model = None
                 self.processor = None

@@ -146,6 +161,27 @@ def do_chat(self, user_prompt: str, image: Image, system_prompt: str=None,

             del generator

+        elif use_MLX:
+
+            # Using phi_3_vision_mlx v0.0.2
+            # https://github.com/JosefAlbers/Phi-3-Vision-MLX/tree/v0.0.2-beta
+            start_inference_time = time.perf_counter()
+            response = generate(self.model, self.processor, prompt, [image])
+            inferenceMs = int((time.perf_counter() - start_inference_time) * 1000)
+
+            # Using latest phi_3_vision_mlx
+            # import os
+            # temp_name = "onnx_genai_temp_image.png"
+            # image.save(temp_name)
+            #
+            # agent = Agent()
+            # start_inference_time = time.perf_counter()
+            # response = agent(prompt, images=[temp_name])
+            # inferenceMs = int((time.perf_counter() - start_inference_time) * 1000)
+            # agent.end()
+            #
+            # os.remove(temp_name)
+
         else:
             inputs = self.processor(prompt, image, return_tensors="pt").to(self.device)
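
A minimal standalone sketch of the MLX code path added above, assuming phi_3_vision_mlx v0.0.2 and a locally downloaded model. The "models" directory and "test.jpg" filename are illustrative, not from this commit, and the real module builds its prompt via the Phi-3 chat template rather than the plain string used here:

import platform
import sys
import time

from PIL import Image

use_MLX = sys.platform == 'darwin' and "ARM64" in platform.uname().version

if use_MLX:
    from phi_3_vision_mlx import generate, load

    # Load the model and processor from a local directory (hypothetical path).
    model, processor = load(model_path="models", adapter_path=None)

    image  = Image.open("test.jpg")      # any RGB test image
    prompt = "Describe this image."

    start = time.perf_counter()
    response = generate(model, processor, prompt, [image])
    print(f"{response}  ({int((time.perf_counter() - start) * 1000)}ms)")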

multimode_llm_adapter.py

Lines changed: 67 additions & 35 deletions
@@ -13,7 +13,7 @@
 # Import CodeProject.AI SDK
 from codeproject_ai_sdk import RequestData, ModuleRunner, ModuleOptions, LogMethod, LogVerbosity, JSON

-from multimode_llm import MultiModeLLM, use_ONNX
+from multimode_llm import MultiModeLLM, use_ONNX, use_MLX

 class MultiModeLLM_adapter(ModuleRunner):

@@ -36,6 +36,13 @@ def initialise(self) -> None:
             self.model_repo = "microsoft/Phi-3-vision-128k-instruct-onnx-cpu"
             self.model_filename = None # "Phi-3-vision-128k-instruct.gguf"
             self.models_dir = "cpu-int4-rtn-block-32-acc-level-4"
+        elif use_MLX:
+            self.inference_device  = "GPU"
+            self.inference_library = "MLX"
+            self.device = "mps"
+            self.model_repo = "microsoft/Phi-3.5-vision-instruct"
+            self.model_filename = None # "Phi-3.5-vision-instruct.gguf"
+            self.models_dir = "models"
         else:
             print("*** Multi-modal LLM using CPU only: This module requires > 16Gb RAM")
             # If only...

@@ -101,47 +108,72 @@ def long_process(self, data: RequestData) -> JSON:
         # # pix = page.get_pixmap(matrix=mat) # use 'mat' instead of the identity matrix

         start_process_time = time.perf_counter()
+        start_inference_time = time.perf_counter()

-        try:
-            (generator, tokenizer_stream) = self.multimode_chat.do_chat(user_prompt, image,
-                                                                        system_prompt,
-                                                                        max_tokens=max_tokens,
-                                                                        temperature=temperature,
-                                                                        stream=True)
-
-            start_inference_time = time.perf_counter()
-
-            if generator:
-                while not generator.is_done():
-                    if self.cancelled:
-                        self.cancelled = False
-                        stop_reason = "cancelled"
-                        break
+        error = None

-                    generator.compute_logits()
-                    generator.generate_next_token()
-
-                    next_tokens = generator.get_next_tokens()
-                    next_token = next_tokens[0]
-                    next_response = tokenizer_stream.decode(next_token)
+        try:
+            if use_ONNX:
+                (generator, tokenizer_stream) = self.multimode_chat.do_chat(user_prompt, image,
+                                                                            system_prompt,
+                                                                            max_tokens=max_tokens,
+                                                                            temperature=temperature,
+                                                                            stream=True)
+                if generator:
+                    while not generator.is_done():
+                        if self.cancelled:
+                            self.cancelled = False
+                            stop_reason = "cancelled"
+                            break
+
+                        generator.compute_logits()
+                        generator.generate_next_token()
+
+                        next_tokens = generator.get_next_tokens()
+                        next_token = next_tokens[0]
+                        next_response = tokenizer_stream.decode(next_token)
+
+                        self.reply_text += next_response
+
+                inferenceMs : int = int((time.perf_counter() - start_inference_time) * 1000)
+
+                if generator:
+                    del generator

-                    self.reply_text += next_response
+            else:
+                llm_response = self.multimode_chat.do_chat(user_prompt, image, system_prompt,
+                                                           max_tokens=max_tokens,
+                                                           temperature=temperature,
+                                                           stream=False)
+                if llm_response["success"]:
+                    inferenceMs = llm_response["inferenceMs"]
+                    self.reply_text = llm_response["reply"]
+                else:
+                    error = llm_response["error"] if "error" in llm_response["error"] else "Error generating reply"
+                    inferenceMs = 0
+
+            if stop_reason == "cancelled" and not self.reply_text and not error:
+                error = "Operation cancelled"

-            inferenceMs : int = int((time.perf_counter() - start_inference_time) * 1000)
-
-            if generator:
-                del generator
-
             if stop_reason is None:
                 stop_reason = "completed"

-            response = {
-                "success": True,
-                "reply": self.reply_text,
-                "stop_reason": stop_reason,
-                "processMs": int((time.perf_counter() - start_process_time) * 1000),
-                "inferenceMs" : inferenceMs
-            }
+            if error:
+                response = {
+                    "success": False,
+                    "error": error,
+                    "reply": "",
+                    "stop_reason": stop_reason,
+                    "processMs": int((time.perf_counter() - start_process_time) * 1000),
+                }
+            else:
+                response = {
+                    "success": True,
+                    "reply": self.reply_text,
+                    "stop_reason": stop_reason,
+                    "processMs": int((time.perf_counter() - start_process_time) * 1000),
+                    "inferenceMs" : inferenceMs
+                }

         except Exception as ex:
             self.report_error(ex, __file__)
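
The new non-streaming branch expects do_chat(stream=False) to return a dict with "success", "reply", "inferenceMs" and optionally "error". A minimal sketch of that mapping with a stubbed result (handle_llm_response is a hypothetical helper, not part of the module); note the error guard here checks the response dict itself for an "error" key, which is the intent of the branch in the commit:

import time

def handle_llm_response(llm_response: dict, start_process_time: float) -> dict:
    """Map a non-streaming do_chat() result onto the adapter's response payload."""
    if llm_response.get("success"):
        return {
            "success":     True,
            "reply":       llm_response["reply"],
            "stop_reason": "completed",
            "processMs":   int((time.perf_counter() - start_process_time) * 1000),
            "inferenceMs": llm_response["inferenceMs"],
        }

    return {
        "success":     False,
        "error":       llm_response.get("error", "Error generating reply"),
        "reply":       "",
        "stop_reason": "completed",
        "processMs":   int((time.perf_counter() - start_process_time) * 1000),
    }

# Example with a stubbed chat result:
start = time.perf_counter()
print(handle_llm_response({"success": True, "reply": "A cat.", "inferenceMs": 850}, start))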

requirements.macos.arm64.txt

Lines changed: 6 additions & 2 deletions
@@ -1,7 +1,11 @@
-#! Python3.10
+#! Python3.12

 # For Phi-3 for Apple Silicon using MLX
-phi-3-vision-mlx           # Installing onnxruntime-genai, the ONNX Runtime generate() API
+#See https://huggingface.co/JosefAlbers/Phi-3-vision-128k-instruct-mlx
+mlx                        # Installing MLX, a framework for machine learning on Apple silicon.
+phi-3-vision-mlx==0.0.2    # Installing onnxruntime-genai, the ONNX Runtime generate() API
+torch                      # Installing PyTorch, an open source machine learning framework
+torchvision                # Installing TorchVision, for working with computer vision models

 CodeProject-AI-SDK         # Installing the CodeProject.AI SDK
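
A quick sanity check, not part of the commit, that the packages listed in requirements.macos.arm64.txt resolved into the module's venv on Apple silicon:

import importlib.metadata as md

# Distribution names as they appear in the requirements file.
for pkg in ("mlx", "phi-3-vision-mlx", "torch", "torchvision", "codeproject-ai-sdk"):
    try:
        print(f"{pkg}: {md.version(pkg)}")
    except md.PackageNotFoundError:
        print(f"{pkg}: not installed")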
