Skip to content

Commit

Permalink
feat: gradio demo integration (#16)
Browse files Browse the repository at this point in the history
Co-authored-by: Bo Liu <[email protected]>
Co-authored-by: Haoyu Lu <[email protected]>
  • Loading branch information
3 people authored Mar 13, 2024
1 parent 86a3096 commit 6014260
Show file tree
Hide file tree
Showing 24 changed files with 1,787 additions and 44 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ COPYRIGHT = "DeepSeek."
PROJECT_PATH = deepseek_vl
SHELL = /bin/bash
SOURCE_FOLDERS = deepseek_vl
PYTHON_FILES = $(shell find $(SOURCE_FOLDERS) -type f -name "*.py" -o -name "*.pyi")
PYTHON_FILES = $(shell find $(SOURCE_FOLDERS) -type f -name "*.py" -o -name "*.pyi") cli_chat.py inference.py
COMMIT_HASH = $(shell git log -1 --format=%h)
PATH := $(HOME)/go/bin:$(PATH)
PYTHON ?= $(shell command -v python3 || command -v python)
Expand Down Expand Up @@ -86,7 +86,7 @@ format: py-format-install ruff-install addlicense-install
$(PYTHON) -m isort --project $(PROJECT_PATH) $(PYTHON_FILES)
$(PYTHON) -m black $(PYTHON_FILES)
$(PYTHON) -m ruff check . --fix --exit-zero
addlicense -c $(COPYRIGHT) -ignore tests/coverage.xml -l mit -y 2023-$(shell date +"%Y") $(SOURCE_FOLDERS)
addlicense -c $(COPYRIGHT) -ignore tests/coverage.xml -l mit -y 2023-$(shell date +"%Y") $(SOURCE_FOLDERS) cli_chat.py inference.py

clean-py:
find . -type f -name '*.py[co]' -delete
Expand Down
19 changes: 17 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,17 @@ Introducing DeepSeek-VL, an open-source Vision-Language (VL) Model designed for

[DeepSeek-VL: Towards Real-World Vision-Language Understanding](https://arxiv.org/abs/2403.05525)

Haoyu Lu*, Wen Liu*, Bo Zhang**, Bingxuan Wang, Kai Dong, Bo Liu, Jingxiang Sun, Tongzheng Ren, Zhuoshu Li, Yaofeng Sun, Chengqi Deng, Hanwei Xu, Zhenda Xie, Chong Ruan (*Equal Contribution, **Project Lead)
Haoyu Lu*, Wen Liu*, Bo Zhang**, Bingxuan Wang, Kai Dong, Bo Liu, Jingxiang Sun, Tongzheng Ren, Zhuoshu Li, Hao Yang, Yaofeng Sun, Chengqi Deng, Hanwei Xu, Zhenda Xie, Chong Ruan (*Equal Contribution, **Project Lead)

![](https://github.com/deepseek-ai/DeepSeek-VL/blob/main/images/sample.jpg)

## 2. Release

<details>
<summary>✅ <b>2024-03-13</b>: Support DeepSeek-VL Gradio demo.</summary>

</details>

<details>
<summary>✅ <b>2024-03-11</b>: DeepSeek-VL family released, including <code>DeepSeek-VL-7B-base</code>, <code>DeepSeek-VL-7B-chat</code>, <code>DeepSeek-VL-1.3B-base</code>, and <code>DeepSeek-VL-1.3B-chat</code>.</summary>
<br>The release includes a diverse set of models tailored for various applications within the DeepSeek-VL family. The models come in two sizes: 7B and 1.3B parameters, each offering base and chat variants to cater to different needs and integration scenarios.
Expand Down Expand Up @@ -170,6 +175,16 @@ python cli_chat.py --model_path "deepseek-ai/deepseek-vl-7b-chat"
python cli_chat.py --model_path "local model path"
```

### Gradio Demo
```bash
pip install -e .[gradio]

python deepseek_vl/serve/app_deepseek.py
```
![](./images/gradio_demo.png)

Have Fun!

## 5. License

This code repository is licensed under [the MIT License](https://github.com/deepseek-ai/DeepSeek-LLM/blob/HEAD/LICENSE-CODE). The use of DeepSeek-VL Base/Chat models is subject to [DeepSeek Model License](https://github.com/deepseek-ai/DeepSeek-LLM/blob/HEAD/LICENSE-MODEL). DeepSeek-VL series (including Base and Chat) supports commercial use.
Expand All @@ -179,7 +194,7 @@ This code repository is licensed under [the MIT License](https://github.com/deep
```
@misc{lu2024deepseekvl,
title={DeepSeek-VL: Towards Real-World Vision-Language Understanding},
author={Haoyu Lu and Wen Liu and Bo Zhang and Bingxuan Wang and Kai Dong and Bo Liu and Jingxiang Sun and Tongzheng Ren and Zhuoshu Li and Yaofeng Sun and Chengqi Deng and Hanwei Xu and Zhenda Xie and Chong Ruan},
author={Haoyu Lu and Wen Liu and Bo Zhang and Bingxuan Wang and Kai Dong and Bo Liu and Jingxiang Sun and Tongzheng Ren and Zhuoshu Li and Hao Yang and Yaofeng Sun and Chengqi Deng and Hanwei Xu and Zhenda Xie and Chong Ruan},
year={2024},
eprint={2403.05525},
archivePrefix={arXiv},
Expand Down
79 changes: 55 additions & 24 deletions cli_chat.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,31 @@
# Copyright (c) 2023-2024 DeepSeek.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# -*- coding: utf-8 -*-

import argparse
import os
import sys
from PIL import Image
from threading import Thread

import torch
from PIL import Image
from transformers import TextIteratorStreamer

from deepseek_vl.utils.io import load_pretrained_model
Expand Down Expand Up @@ -33,22 +53,19 @@ def get_help_message(image_token):


@torch.inference_mode()
def response(args, conv, pil_images, tokenizer, vl_chat_processor, vl_gpt, generation_config):

def response(
args, conv, pil_images, tokenizer, vl_chat_processor, vl_gpt, generation_config
):
prompt = conv.get_prompt()
prepare_inputs = vl_chat_processor.__call__(
prompt=prompt,
images=pil_images,
force_batchify=True
prompt=prompt, images=pil_images, force_batchify=True
).to(vl_gpt.device)

# run image encoder to get the image embeddings
inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

streamer = TextIteratorStreamer(
tokenizer=tokenizer,
skip_prompt=True,
skip_special_tokens=True
tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True
)
generation_config["inputs_embeds"] = inputs_embeds
generation_config["attention_mask"] = prepare_inputs.attention_mask
Expand Down Expand Up @@ -79,17 +96,17 @@ def chat(args, tokenizer, vl_chat_processor, vl_gpt, generation_config):
help_msg = get_help_message(image_token)

while True:

print(help_msg)

pil_images = []
conv = vl_chat_processor.new_chat_template()
roles = conv.roles

while True:

# get user input
user_input = get_user_input(f"{roles[0]} [{image_token} indicates an image]: ")
user_input = get_user_input(
f"{roles[0]} [{image_token} indicates an image]: "
)

if user_input == "exit":
print("Chat program exited.")
Expand Down Expand Up @@ -135,11 +152,21 @@ def chat(args, tokenizer, vl_chat_processor, vl_gpt, generation_config):
sys.exit(0)

else:
print(f"File error, `{image_file}` does not exist. Please input the correct file path.")
print(
f"File error, `{image_file}` does not exist. Please input the correct file path."
)

# get the answer by the model's prediction
answer = ""
answer_iter = response(args, conv, pil_images, tokenizer, vl_chat_processor, vl_gpt, generation_config)
answer_iter = response(
args,
conv,
pil_images,
tokenizer,
vl_chat_processor,
vl_gpt,
generation_config,
)
sys.stdout.write(f"{conv.roles[1]}: ")
for char in answer_iter:
answer += char
Expand All @@ -153,7 +180,6 @@ def chat(args, tokenizer, vl_chat_processor, vl_gpt, generation_config):


def main(args):

# setup
tokenizer, vl_chat_processor, vl_gpt = load_pretrained_model(args.model_path)
generation_config = dict(
Expand All @@ -164,12 +190,14 @@ def main(args):
use_cache=True,
)
if args.temperature > 0:
generation_config.update({
"do_sample": True,
"top_p": args.top_p,
"temperature": args.temperature,
"repetition_penalty": args.repetition_penalty,
})
generation_config.update(
{
"do_sample": True,
"top_p": args.top_p,
"temperature": args.temperature,
"repetition_penalty": args.repetition_penalty,
}
)
else:
generation_config.update({"do_sample": False})

Expand All @@ -178,12 +206,15 @@ def main(args):

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, default="deepseek-ai/deepseek-vl-7b-chat",
help="the huggingface model name or the local path of the downloaded huggingface model.")
parser.add_argument(
"--model_path",
type=str,
default="deepseek-ai/deepseek-vl-7b-chat",
help="the huggingface model name or the local path of the downloaded huggingface model.",
)
parser.add_argument("--temperature", type=float, default=0.2)
parser.add_argument("--top_p", type=float, default=0.95)
parser.add_argument("--repetition_penalty", type=float, default=1.1)
parser.add_argument("--max_gen_len", type=int, default=512)
args = parser.parse_args()
main(args)

Loading

0 comments on commit 6014260

Please sign in to comment.