
Commit e0aa6c5

yatarkan and eaidova authored
Switch Qwen2-VL notebook to GenAI (#2723)
Ticket: CVS-158716
---------
Co-authored-by: Ekaterina Aidova <ekaterina.aidova@intel.com>
1 parent 24cfbea commit e0aa6c5
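
In practical terms, the switch means the notebook now drives gradio_helper.make_demo with a single OpenVINO GenAI pipeline object instead of a transformers model plus processor pair. A minimal sketch of how such a pipeline is typically prepared; the export command, the "qwen2-vl-2b-ov" directory name, and the "CPU" device below are illustrative assumptions, not taken from this commit:

# Hypothetical setup (not part of this commit): the model is first exported to
# OpenVINO IR, e.g. with
#   optimum-cli export openvino --model Qwen/Qwen2-VL-2B-Instruct qwen2-vl-2b-ov
# and then loaded through the OpenVINO GenAI visual-language pipeline.
import openvino_genai

pipe = openvino_genai.VLMPipeline("qwen2-vl-2b-ov", "CPU")  # placeholder model dir / device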

File tree

3 files changed: +212 -304 lines changed


notebooks/qwen2-vl/README.md

+2 -1
@@ -34,7 +34,8 @@ The tutorial consists from following steps:
 
 - Install requirements
 - Convert and Optimize model
-- Run OpenVINO model inference
+- Prepare OpenVINO GenAI Inference Pipeline
+- Run OpenVINO GenAI model inference
 - Launch Interactive demo
 
 In this demonstration, you'll create interactive chatbot that can answer questions about provided image's content.
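
The two GenAI steps added above boil down to a few lines of code. A minimal sketch, assuming a converted model in a local "qwen2-vl-2b-ov" directory and reusing the bee.jpg example image that gradio_helper.py downloads; the keyword names (prompt, image, generation_config, streamer) mirror the generate() call visible in this commit's diff, everything else is an assumption:

import numpy as np
import openvino as ov
import openvino_genai
from PIL import Image

# Prepare OpenVINO GenAI Inference Pipeline ("qwen2-vl-2b-ov" and "CPU" are placeholders)
pipe = openvino_genai.VLMPipeline("qwen2-vl-2b-ov", "CPU")

# Pack the image into an ov.Tensor the same way read_image() in gradio_helper.py does
pic = Image.open("bee.jpg").convert("RGB")
image = ov.Tensor(np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.byte))

# Run OpenVINO GenAI model inference, streaming subwords to stdout
config = openvino_genai.GenerationConfig()
config.max_new_tokens = 128
pipe.generate(prompt="What is on the flower?", image=image, generation_config=config, streamer=lambda subword: print(subword, end=""))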

notebooks/qwen2-vl/gradio_helper.py

+119 -193
@@ -1,205 +1,131 @@
+from pathlib import Path
 import gradio as gr
-import copy
-import re
-from threading import Thread
-from transformers import TextIteratorStreamer
-from qwen_vl_utils import process_vision_info
-
-
-def _parse_text(text):
-    lines = text.split("\n")
-    lines = [line for line in lines if line != ""]
-    count = 0
-    for i, line in enumerate(lines):
-        if "```" in line:
-            count += 1
-            items = line.split("`")
-            if count % 2 == 1:
-                lines[i] = f'<pre><code class="language-{items[-1]}">'
-            else:
-                lines[i] = "<br></code></pre>"
-        else:
-            if i > 0:
-                if count % 2 == 1:
-                    line = line.replace("`", r"\`")
-                    line = line.replace("<", "&lt;")
-                    line = line.replace(">", "&gt;")
-                    line = line.replace(" ", "&nbsp;")
-                    line = line.replace("*", "&ast;")
-                    line = line.replace("_", "&lowbar;")
-                    line = line.replace("-", "&#45;")
-                    line = line.replace(".", "&#46;")
-                    line = line.replace("!", "&#33;")
-                    line = line.replace("(", "&#40;")
-                    line = line.replace(")", "&#41;")
-                    line = line.replace("$", "&#36;")
-                lines[i] = "<br>" + line
-    text = "".join(lines)
-    return text
-
-
-def _remove_image_special(text):
-    text = text.replace("<ref>", "").replace("</ref>", "")
-    return re.sub(r"<box>.*?(</box>|$)", "", text)
-
-
-def is_video_file(filename):
-    video_extensions = [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg"]
-    return any(filename.lower().endswith(ext) for ext in video_extensions)
-
-
-def transform_messages(original_messages):
-    transformed_messages = []
-    for message in original_messages:
-        new_content = []
-        for item in message["content"]:
-            if "image" in item:
-                new_item = {"type": "image", "image": item["image"]}
-            elif "text" in item:
-                new_item = {"type": "text", "text": item["text"]}
-            elif "video" in item:
-                new_item = {"type": "video", "video": item["video"]}
-            else:
-                continue
-            new_content.append(new_item)
 
-        new_message = {"role": message["role"], "content": new_content}
-        transformed_messages.append(new_message)
 
-    return transformed_messages
+from PIL import Image
+import numpy as np
+import requests
+from threading import Event, Thread
+import inspect
+from queue import Queue
 
+example_image_urls = [
+    (
+        "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/1d6a0188-5613-418d-a1fd-4560aae1d907",
+        "bee.jpg",
+    ),
+    (
+        "https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/6cc7feeb-0721-4b5d-8791-2576ed9d2863",
+        "baklava.png",
+    ),
+]
+for url, file_name in example_image_urls:
+    if not Path(file_name).exists():
+        Image.open(requests.get(url, stream=True).raw).save(file_name)
 
-def make_demo(model, processor):
-    def call_local_model(model, processor, messages):
-        messages = transform_messages(messages)
 
-        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        image_inputs, video_inputs = process_vision_info(messages)
-        inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(model.device)
+def make_demo(model):
+    import openvino_genai
+    import openvino as ov
 
-        tokenizer = processor.tokenizer
-        streamer = TextIteratorStreamer(tokenizer, timeout=3600.0, skip_prompt=True, skip_special_tokens=True)
+    has_additonal_buttons = "undo_button" in inspect.signature(gr.ChatInterface.__init__).parameters
 
-        gen_kwargs = {"max_new_tokens": 512, "streamer": streamer, **inputs}
+    def read_image(path: str) -> ov.Tensor:
+        """
 
-        thread = Thread(target=model.generate, kwargs=gen_kwargs)
-        thread.start()
+        Args:
+            path: The path to the image.
 
-        generated_text = ""
-        for new_text in streamer:
-            generated_text += new_text
-            yield generated_text
-
-    def create_predict_fn():
-        def predict(_chatbot, task_history):
-            chat_query = _chatbot[-1][0]
-            query = task_history[-1][0]
-            if len(chat_query) == 0:
-                _chatbot.pop()
-                task_history.pop()
-                return _chatbot
-            print("User: " + _parse_text(query))
-            history_cp = copy.deepcopy(task_history)
-            full_response = ""
-            messages = []
-            content = []
-            for q, a in history_cp:
-                if isinstance(q, (tuple, list)):
-                    if is_video_file(q[0]):
-                        content.append({"video": f"file://{q[0]}"})
-                    else:
-                        content.append({"image": f"file://{q[0]}"})
-                else:
-                    content.append({"text": q})
-                    messages.append({"role": "user", "content": content})
-                    messages.append({"role": "assistant", "content": [{"text": a}]})
-                    content = []
-            messages.pop()
-
-            for response in call_local_model(model, processor, messages):
-                _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
-
-                yield _chatbot
-                full_response = _parse_text(response)
-
-            task_history[-1] = (query, full_response)
-            print("Qwen-VL-Chat: " + _parse_text(full_response))
-            yield _chatbot
-
-        return predict
-
-    def create_regenerate_fn():
-        def regenerate(_chatbot, task_history):
-            if not task_history:
-                return _chatbot
-            item = task_history[-1]
-            if item[1] is None:
-                return _chatbot
-            task_history[-1] = (item[0], None)
-            chatbot_item = _chatbot.pop(-1)
-            if chatbot_item[0] is None:
-                _chatbot[-1] = (_chatbot[-1][0], None)
-            else:
-                _chatbot.append((chatbot_item[0], None))
-            _chatbot_gen = predict(_chatbot, task_history)
-            for _chatbot in _chatbot_gen:
-                yield _chatbot
-
-        return regenerate
-
-    predict = create_predict_fn()
-    regenerate = create_regenerate_fn()
-
-    def add_text(history, task_history, text):
-        task_text = text
-        history = history if history is not None else []
-        task_history = task_history if task_history is not None else []
-        history = history + [(_parse_text(text), None)]
-        task_history = task_history + [(task_text, None)]
-        return history, task_history, ""
-
-    def add_file(history, task_history, file):
-        history = history if history is not None else []
-        task_history = task_history if task_history is not None else []
-        history = history + [((file.name,), None)]
-        task_history = task_history + [((file.name,), None)]
-        return history, task_history
-
-    def reset_user_input():
-        return gr.update(value="")
-
-    def reset_state(task_history):
-        task_history.clear()
-        return []
-
-    with gr.Blocks() as demo:
-        gr.Markdown("""<center><font size=8>Qwen2-VL OpenVINO demo</center>""")
-
-        chatbot = gr.Chatbot(label="Qwen2-VL", elem_classes="control-height", height=500)
-        query = gr.Textbox(lines=2, label="Input")
-        task_history = gr.State([])
-
-        with gr.Row():
-            addfile_btn = gr.UploadButton("📁 Upload (上传文件)", file_types=["image", "video"])
-            submit_btn = gr.Button("🚀 Submit (发送)")
-            regen_btn = gr.Button("🤔️ Regenerate (重试)")
-            empty_bin = gr.Button("🧹 Clear History (清除历史)")
-
-        submit_btn.click(add_text, [chatbot, task_history, query], [chatbot, task_history]).then(
-            predict, [chatbot, task_history], [chatbot], show_progress=True
-        )
-        submit_btn.click(reset_user_input, [], [query])
-        empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True)
-        regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
-        addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)
-
-        gr.Markdown(
-            """\
-<font size=2>Note: This demo is governed by the original license of Qwen2-VL. \
-We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, \
-including hate speech, violence, pornography, deception, etc. \
-(注:本演示受Qwen2-VL的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,\
-包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)"""
-        )
+        Returns: the ov.Tensor containing the image.
+
+        """
+        pic = Image.open(path).convert("RGB")
+        image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.byte)
+        return ov.Tensor(image_data)
+
+    class TextQueue:
+        def __init__(self) -> None:
+            self.text_queue = Queue()
+            self.stop_signal = None
+            self.stop_tokens = []
 
+        def __call__(self, text):
+            self.text_queue.put(text)
+
+        def __iter__(self):
+            return self
+
+        def __next__(self):
+            value = self.text_queue.get()
+            if value == self.stop_signal or value in self.stop_tokens:
+                raise StopIteration()
+            else:
+                return value
+
+        def reset(self):
+            self.text_queue = Queue()
+
+        def end(self):
+            self.text_queue.put(self.stop_signal)
+
+    def bot_streaming(message, history):
+        print(f"message is - {message}")
+        print(f"history is - {history}")
+
+        if not history:
+            model.start_chat()
+        generation_config = openvino_genai.GenerationConfig()
+        generation_config.max_new_tokens = 128
+        files = message["files"] if isinstance(message, dict) else message.files
+        message_text = message["text"] if isinstance(message, dict) else message.text
+
+        image = None
+        if files:
+            # message["files"][-1] is a Dict or just a string
+            if isinstance(files[-1], dict):
+                image = files[-1]["path"]
+            else:
+                if isinstance(files[-1], (str, Path)):
+                    image = files[-1]
+                else:
+                    image = files[-1] if isinstance(files[-1], (list, tuple)) else files[-1].path
+        if image is not None:
+            image = read_image(image)
+        streamer = TextQueue()
+        stream_complete = Event()
+
+        def generate_and_signal_complete():
+            """
+            generation function for single thread
+            """
+            streamer.reset()
+            generation_kwargs = {"prompt": message_text, "generation_config": generation_config, "streamer": streamer}
+            if image is not None:
+                generation_kwargs["image"] = image
+            model.generate(**generation_kwargs)
+            stream_complete.set()
+            streamer.end()
+
+        t1 = Thread(target=generate_and_signal_complete)
+        t1.start()
+
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            yield buffer
+
+    additional_buttons = {}
+    if has_additonal_buttons:
+        additional_buttons = {"undo_button": None, "retry_button": None}
+    demo = gr.ChatInterface(
+        fn=bot_streaming,
+        title="Qwen2-VL OpenVINO Demo",
+        examples=[
+            {"text": "What is on the flower?", "files": ["./bee.jpg"]},
+            {"text": "How to make this pastry?", "files": ["./baklava.png"]},
+        ],
+        stop_btn=None,
+        multimodal=True,
+        **additional_buttons,
+    )
     return demo
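
With this rewrite, make_demo needs only the GenAI pipeline; the processor argument of the previous version is gone. A hedged usage sketch (the model directory and device are assumed placeholders; start_chat and generate are invoked internally by bot_streaming):

import openvino_genai
from gradio_helper import make_demo

pipe = openvino_genai.VLMPipeline("qwen2-vl-2b-ov", "CPU")  # placeholder model dir / device

demo = make_demo(pipe)  # the helper builds the gr.ChatInterface around bot_streaming
demo.launch()           # start the interactive multimodal chat demo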
