
Commit 6fad3e7

[DOC] Enhance Documentation on Usage of Prebuilt Executable (#15)
Co-authored-by: tjtanaa <[email protected]>
1 parent fb2abcc commit 6fad3e7

5 files changed: +174 additions, −123 deletions


README.md

Lines changed: 11 additions & 4 deletions

````diff
@@ -119,15 +119,15 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
 
 1. `ellm_chatbot --port 7788 --host localhost --server_port <ellm_server_port> --server_host localhost`. **Note:** To find out more of the supported arguments. `ellm_chatbot --help`.
 
-![asset/ellm_chatbot_vid.webp](asset/ellm_chatbot_vid.webp)
+![asset/ellm_chatbot_vid.webp](asset/ellm_chatbot_vid.webp)
 
 ### Launch Model Management UI
 
 It is an interface that allows you to download and deploy OpenAI API compatible server. You can find out the disk space required to download the model in the UI.
 
 1. `ellm_modelui --port 6678`. **Note:** To find out more of the supported arguments. `ellm_modelui --help`.
 
-![Model Management UI](asset/ellm_modelui.png)
+![Model Management UI](asset/ellm_modelui.png)
 
 ## Compile OpenAI-API Compatible Server into Windows Executable
 
@@ -138,13 +138,20 @@ It is an interface that allows you to download and deploy OpenAI API compatible
 5. Use it like `ellm_server`. `.\ellm_api_server.exe --model_path <path/to/model/weight>`.
 
 ## Prebuilt OpenAI API Compatible Windows Executable (Alpha)
+
 You can find the prebuilt OpenAI API Compatible Windows Executable in the Release page.
 
-*Powershell/Terminal Usage (Use it like `ellm_server`)*:
+_Powershell/Terminal Usage (Use it like `ellm_server`)_:
+
 ```powershell
 .\ellm_api_server.exe --model_path <path/to/model/weight>
-```
 
+# DirectML
+.\ellm_api_server.exe --model_path 'EmbeddedLLM_Phi-3-mini-4k-instruct-062024-onnx\onnx\directml\Phi-3-mini-4k-instruct-062024-int4' --port 5555
+
+# IPEX-LLM
+.\ellm_api_server.exe --model_path '.\meta-llama_Meta-Llama-3.1-8B-Instruct\' --backend 'ipex' --device 'xpu' --port 5555 --served_model_name 'meta-llama_Meta/Llama-3.1-8B-Instruct'
+```
 
 ## Acknowledgements
 
````
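The usage examples added above start the server on port 5555. Because the executable exposes an OpenAI-compatible API, any OpenAI client should be able to talk to it once it is running. Below is a minimal, untested sketch using the `openai` Python package; the `/v1` route prefix and the model name are assumptions, so adjust the model name to whatever you passed via `--served_model_name` (or the model path) at launch.

```python
# Minimal sketch: query the local OpenAI-API-compatible server started above.
# Assumptions: server listening on http://localhost:5555, standard /v1 routes,
# and the served model name matching the --served_model_name used at launch.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:5555/v1",  # local ellm_api_server endpoint (assumed prefix)
    api_key="not-needed",                 # local servers typically ignore the key
)

response = client.chat.completions.create(
    model="meta-llama_Meta/Llama-3.1-8B-Instruct",  # replace with your served model name
    messages=[{"role": "user", "content": "Hello! What can you do?"}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```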

setup.py

Lines changed: 60 additions & 38 deletions

```diff
@@ -53,44 +53,67 @@ def _is_ipex() -> bool:
 class ELLMInstallCommand(install):
     def run(self):
         install.run(self)
-        print("is_ipex(): " + str(_is_ipex()))
         if _is_ipex():
-            print("Install Ipex-LLM")
-            result = subprocess.run([
-                'pip', 'install', '--pre', '--upgrade', 'ipex-llm[xpu]',
-                '--extra-index-url', 'https://pytorch-extension.intel.com/release-whl/stable/xpu/us/'
-            ], capture_output=True, text=True)
-
-            result = subprocess.run([
-                'pip', 'install', '--upgrade', 'transformers==4.43.3'
-            ], capture_output=True, text=True)
+            result = subprocess.run(
+                [
+                    "pip",
+                    "install",
+                    "--pre",
+                    "--upgrade",
+                    "ipex-llm[xpu]",
+                    "--extra-index-url",
+                    "https://pytorch-extension.intel.com/release-whl/stable/xpu/us/",
+                ],
+                capture_output=True,
+                text=True,
+            )
+
+            result = subprocess.run(
+                ["pip", "install", "--upgrade", "transformers==4.43.3"],
+                capture_output=True,
+                text=True,
+            )
 
         if _is_directml():
-            result = subprocess.run([
-                'conda', 'install', 'conda-forge::vs2015_runtime' , '-y'
-            ], capture_output=True, text=True)
+            result = subprocess.run(
+                ["conda", "install", "conda-forge::vs2015_runtime", "-y"],
+                capture_output=True,
+                text=True,
+            )
+
 
 class ELLMDevelopCommand(develop):
     def run(self):
         develop.run(self)
-        print("is_ipex(): " + str(_is_ipex()))
         if _is_ipex():
             print("Install Ipex-LLM")
-            result = subprocess.run([
-                'pip', 'install', '--pre', '--upgrade', 'ipex-llm[xpu]',
-                '--extra-index-url', 'https://pytorch-extension.intel.com/release-whl/stable/xpu/us/'
-            ], capture_output=True, text=True)
-
-            result = subprocess.run([
-                'pip', 'install', '--upgrade', 'transformers==4.43.3'
-            ], capture_output=True, text=True)
+            result = subprocess.run(
+                [
+                    "pip",
+                    "install",
+                    "--pre",
+                    "--upgrade",
+                    "ipex-llm[xpu]",
+                    "--extra-index-url",
+                    "https://pytorch-extension.intel.com/release-whl/stable/xpu/us/",
+                ],
+                capture_output=True,
+                text=True,
+            )
+
+            result = subprocess.run(
+                ["pip", "install", "--upgrade", "transformers==4.43.3"],
+                capture_output=True,
+                text=True,
+            )
 
         if _is_directml():
-            result = subprocess.run([
-                'conda', 'install', 'conda-forge::vs2015_runtime' , '-y'
-            ], capture_output=True, text=True)
-
-            print(result)
+            result = subprocess.run(
+                ["conda", "install", "conda-forge::vs2015_runtime", "-y"],
+                capture_output=True,
+                text=True,
+            )
+
 
 def get_path(*filepath) -> str:
     return os.path.join(ROOT_DIR, *filepath)
@@ -158,14 +181,12 @@ def get_ellm_version() -> str:
 dependency_links = []
 extra_install_requires = []
 
-if(_is_directml() or _is_cuda() or _is_cpu()):
-    dependency_links.extend(["https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/"])
-# elif(_is_ipex()):
-#     dependency_links.extend(["https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"])
-#     extra_install_requires.extend(["torch==2.1.0a0 @ https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"])
-#     extra_install_requires.extend(["ipex-llm[xpu]==2.1.0b20240702 @ https://pytorch-extension.intel.com/release-whl/stable/xpu/us/"])
-
-#     extra_install_requires = ['ipex-llm[xpu]==2.1.0b20240702 @ https://pytorch-extension.intel.com/release-whl/stable/xpu/us/']
+if _is_directml() or _is_cuda() or _is_cpu():
+    dependency_links.extend(
+        [
+            "https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-genai/pypi/simple/"
+        ]
+    )
 
 setup(
     name="embeddedllm",
@@ -192,7 +213,8 @@ def get_ellm_version() -> str:
     ],
     install_requires=get_requirements()
     + _read_requirements("requirements-common.txt")
-    + _read_requirements("requirements-build.txt") + extra_install_requires,
+    + _read_requirements("requirements-build.txt")
+    + extra_install_requires,
     # Add other metadata and dependencies as needed
     extras_require={
         "lint": _read_requirements("requirements-lint.txt"),
@@ -209,7 +231,7 @@ def get_ellm_version() -> str:
         ],
     },
     cmdclass={
-        'install': ELLMInstallCommand,
-        'develop': ELLMDevelopCommand,
+        "install": ELLMInstallCommand,
+        "develop": ELLMDevelopCommand,
     },
 )
```
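The custom `install`/`develop` commands above shell out to `pip` via `subprocess.run` but discard the `CompletedProcess` result. As a side note (not part of this commit), the same pattern with the return code actually checked looks roughly like the sketch below; `pip_install` is a hypothetical helper, and `sys.executable -m pip` is used so the packages land in the interpreter that is running `setup.py`.

```python
# Illustrative sketch, not from the repository: subprocess-based pip install
# with the result inspected instead of silently ignored.
import subprocess
import sys


def pip_install(*args: str) -> None:
    """Install packages into the current environment and fail loudly on error."""
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", *args],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        # Surface pip's own error output instead of continuing with a broken env.
        raise RuntimeError(f"pip install {' '.join(args)} failed:\n{result.stderr}")


# Example call mirroring the IPEX-LLM branch in the diff above:
# pip_install(
#     "--pre", "--upgrade", "ipex-llm[xpu]",
#     "--extra-index-url", "https://pytorch-extension.intel.com/release-whl/stable/xpu/us/",
# )
```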

src/embeddedllm/backend/base_engine.py

Lines changed: 16 additions & 13 deletions

```diff
@@ -135,15 +135,15 @@ def _get_and_verify_max_len(
     for key in possible_keys:
         max_len = getattr(hf_config, key, None)
         if max_len is not None:
-            max_len_key = key if max_len < derived_max_model_len \
-                else max_len_key
+            max_len_key = key if max_len < derived_max_model_len else max_len_key
             derived_max_model_len = min(derived_max_model_len, max_len)
 
     # If sliding window is manually disabled, max_length should be less
     # than the sliding window length in the model config.
     if disable_sliding_window and sliding_window_len is not None:
-        max_len_key = "sliding_window" \
-            if sliding_window_len < derived_max_model_len else max_len_key
+        max_len_key = (
+            "sliding_window" if sliding_window_len < derived_max_model_len else max_len_key
+        )
         derived_max_model_len = min(derived_max_model_len, sliding_window_len)
 
     # If none of the keys were found in the config, use a default and
@@ -157,8 +157,10 @@ def _get_and_verify_max_len(
         logger.warning(
             "The model's config.json does not contain any of the following "
             "keys to determine the original maximum length of the model: "
-            "%s. Assuming the model's maximum length is %d.", possible_keys,
-            default_max_len)
+            "%s. Assuming the model's maximum length is %d.",
+            possible_keys,
+            default_max_len,
+        )
         derived_max_model_len = default_max_len
 
     rope_scaling = getattr(hf_config, "rope_scaling", None)
@@ -168,8 +170,7 @@ def _get_and_verify_max_len(
         elif "rope_type" in rope_scaling:
             rope_type = rope_scaling["rope_type"]
         else:
-            raise ValueError(
-                "rope_scaling must have a 'type' or 'rope_type' key.")
+            raise ValueError("rope_scaling must have a 'type' or 'rope_type' key.")
 
         # The correct one should be "longrope", kept "su" here
         # to be backward compatible
@@ -180,13 +181,13 @@ def _get_and_verify_max_len(
             raise NotImplementedError(
                 "Disabling sliding window is not supported for models "
                 "with rope_scaling. Please raise an issue so we can "
-                "investigate.")
+                "investigate."
+            )
 
         assert "factor" in rope_scaling
         scaling_factor = rope_scaling["factor"]
         if rope_type == "yarn":
-            derived_max_model_len = rope_scaling[
-                "original_max_position_embeddings"]
+            derived_max_model_len = rope_scaling["original_max_position_embeddings"]
         derived_max_model_len *= scaling_factor
 
     # If the user specified a max length, make sure it is smaller than the
@@ -205,7 +206,8 @@ def _get_and_verify_max_len(
             raise NotImplementedError(
                 "Disabling sliding window is not supported for models "
                 "model_max_length in the config. Please raise an issue "
-                "so we can investigate.")
+                "so we can investigate."
+            )
             pass
         else:
             raise ValueError(
@@ -214,5 +216,6 @@ def _get_and_verify_max_len(
             f"({max_len_key}={derived_max_model_len} or model_max_length="
             f"{model_max_length} in model's config.json). This may lead "
             "to incorrect model outputs or CUDA errors. Make sure the "
-            "value is correct and within the model context size.")
+            "value is correct and within the model context size."
+        )
     return int(max_model_len)
```
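The hunks above only reformat `_get_and_verify_max_len`, which scans the HuggingFace config for a context-length key, optionally caps it by the sliding-window length, and rescales it when `rope_scaling` is present (for YaRN the base comes from `original_max_position_embeddings` multiplied by `factor`). The sketch below is an illustrative reduction of that arithmetic, not the repository's code; the key list and config values are assumptions.

```python
# Illustrative only: the core max-length derivation performed by
# _get_and_verify_max_len, with a shortened, assumed list of config keys.

def derive_max_len(config: dict, default_max_len: int = 2048) -> int:
    possible_keys = ["max_position_embeddings", "model_max_length", "seq_length"]  # assumed subset
    derived = float("inf")
    for key in possible_keys:
        value = config.get(key)
        if value is not None:
            derived = min(derived, value)  # keep the most restrictive key
    if derived == float("inf"):
        derived = default_max_len  # none of the keys were present

    rope_scaling = config.get("rope_scaling")
    if rope_scaling is not None and rope_scaling.get("rope_type") == "yarn":
        # YaRN: scale the original training length by the factor.
        derived = rope_scaling["original_max_position_embeddings"] * rope_scaling["factor"]
    return int(derived)


# Hypothetical config: max_position_embeddings=4096 with a YaRN factor of 4
# yields 4096 * 4 = 16384.
print(derive_max_len({
    "max_position_embeddings": 4096,
    "rope_scaling": {"rope_type": "yarn", "factor": 4,
                     "original_max_position_embeddings": 4096},
}))  # -> 16384
```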
