import inspect
from pathlib import Path
from queue import Queue
from threading import Event, Thread

import gradio as gr
import numpy as np
import requests
from PIL import Image

# Example assets for the chat demo: download each image once and cache it
# next to the script so repeated launches do not need the network.
example_image_urls = [
    (
        "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/1d6a0188-5613-418d-a1fd-4560aae1d907",
        "bee.jpg",
    ),
    (
        "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/6cc7feeb-0721-4b5d-8791-2576ed9d2863",
        "baklava.png",
    ),
]
for url, file_name in example_image_urls:
    if not Path(file_name).exists():
        response = requests.get(url, stream=True)
        # Fail loudly on an HTTP error status instead of handing an error body
        # to PIL (which would either raise a confusing error or cache garbage).
        response.raise_for_status()
        Image.open(response.raw).save(file_name)
71 | 25 |
|
72 |
def make_demo(model):
    """Build a Gradio ChatInterface demo around an OpenVINO GenAI VLM pipeline.

    Args:
        model: an ``openvino_genai.VLMPipeline``-like object exposing
            ``start_chat()`` and ``generate(prompt=..., generation_config=...,
            streamer=..., image=...)``.

    Returns:
        A ``gr.ChatInterface`` instance ready to be ``.launch()``-ed.
    """
    import openvino_genai
    import openvino as ov

    # Older gradio releases do not accept the undo/retry button kwargs;
    # probe the signature instead of pinning a gradio version.
    has_additional_buttons = "undo_button" in inspect.signature(gr.ChatInterface.__init__).parameters

    def read_image(path: str) -> ov.Tensor:
        """Load an image file as an NHWC (1, H, W, 3) uint8 ov.Tensor.

        Args:
            path: The path to the image.

        Returns:
            the ov.Tensor containing the image.
        """
        pic = Image.open(path).convert("RGB")
        # np.asarray(pic) yields (H, W, 3) directly and is much faster than
        # getdata()+reshape.  Pixels are 0..255, so the dtype must be uint8 —
        # a signed byte (np.byte) would wrap values above 127 negative.
        image_data = np.asarray(pic, dtype=np.uint8)[None]
        return ov.Tensor(image_data)

    class TextQueue:
        """Minimal streamer callback: collects generated text chunks into an
        iterable, blocking queue consumed by the UI thread."""

        def __init__(self) -> None:
            self.text_queue = Queue()
            self.stop_signal = None  # sentinel pushed by end() to terminate iteration
            self.stop_tokens = []

        def __call__(self, text):
            # Invoked by the pipeline for each newly generated chunk.
            self.text_queue.put(text)

        def __iter__(self):
            return self

        def __next__(self):
            value = self.text_queue.get()
            if value == self.stop_signal or value in self.stop_tokens:
                raise StopIteration()
            return value

        def reset(self):
            self.text_queue = Queue()

        def end(self):
            self.text_queue.put(self.stop_signal)

    def bot_streaming(message, history):
        """Gradio chat handler: stream the model's reply token by token."""
        print(f"message is - {message}")
        print(f"history is - {history}")

        if not history:
            # First turn of a fresh conversation: reset the pipeline chat state.
            model.start_chat()
        generation_config = openvino_genai.GenerationConfig()
        generation_config.max_new_tokens = 128
        files = message["files"] if isinstance(message, dict) else message.files
        message_text = message["text"] if isinstance(message, dict) else message.text

        image = None
        if files:
            # Gradio versions differ in how uploads are represented: a dict,
            # a plain path, a (path, caption) tuple, or an object with .path.
            last = files[-1]
            if isinstance(last, dict):
                image = last["path"]
            elif isinstance(last, (str, Path)):
                image = last
            elif isinstance(last, (list, tuple)):
                # Take the file path, not the whole (path, caption) pair.
                image = last[0]
            else:
                image = last.path
        if image is not None:
            image = read_image(image)
        streamer = TextQueue()
        stream_complete = Event()

        def generate_and_signal_complete():
            """Run generation on a worker thread and unblock the streaming loop."""
            streamer.reset()
            generation_kwargs = {"prompt": message_text, "generation_config": generation_config, "streamer": streamer}
            if image is not None:
                generation_kwargs["image"] = image
            model.generate(**generation_kwargs)
            stream_complete.set()
            streamer.end()

        t1 = Thread(target=generate_and_signal_complete)
        t1.start()

        # Re-yield the growing buffer so the UI shows incremental output.
        buffer = ""
        for new_text in streamer:
            buffer += new_text
            yield buffer

    additional_buttons = {}
    if has_additional_buttons:
        additional_buttons = {"undo_button": None, "retry_button": None}
    demo = gr.ChatInterface(
        fn=bot_streaming,
        title="Qwen2-VL OpenVINO Demo",
        examples=[
            {"text": "What is on the flower?", "files": ["./bee.jpg"]},
            {"text": "How to make this pastry?", "files": ["./baklava.png"]},
        ],
        stop_btn=None,
        multimodal=True,
        **additional_buttons,
    )
    return demo
|
0 commit comments