Skip to content

Commit 0639306

Browse files
committed
add readme
1 parent d1adbbf commit 0639306

File tree

4 files changed

+520
-298
lines changed

4 files changed

+520
-298
lines changed

notebooks/phi-4-multimodal/README.md

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Multimodal assistant with Phi-4-mini-multimodal and OpenVINO
2+
3+
Phi-4-multimodal-instruct is a lightweight open multimodal foundation model. The model processes text, image, and audio inputs, generating text outputs. Phi-4-multimodal-instruct has 5.6B parameters and is a multimodal transformer model. The model has the pretrained Phi-4-mini as the backbone language model, and the advanced encoders and adapters of vision and speech.
4+
In this tutorial we will explore how to run the Phi-4-multimodal-instruct model using [OpenVINO](https://github.com/openvinotoolkit/openvino) and optimize it using [NNCF](https://github.com/openvinotoolkit/nncf).
5+
6+
## Notebook contents
7+
The tutorial consists of the following steps:
8+
9+
- Install requirements
10+
- Convert and Optimize model
11+
- Run OpenVINO model inference
12+
- Launch Interactive demo
13+
14+
In this demonstration, you'll create an interactive chatbot that can answer questions about the content of provided images and audio.
15+
![phi4](https://github.com/user-attachments/assets/8c0b8e50-417e-4579-b799-e9b9c15e8a87)
16+
17+
## Installation instructions
18+
This is a self-contained example that relies solely on its own code.<br/>
19+
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
20+
For details, please refer to [Installation Guide](../../README.md).
21+
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/phi-4-multimodal/README.md" />

notebooks/phi-4-multimodal/gradio_helper.py

+17-27
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
from copy import deepcopy
32
from typing import Dict, List
43
from PIL import Image
@@ -10,8 +9,8 @@
109
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
1110
AUDIO_EXTENSIONS = (".mp3", ".wav", "flac", ".m4a", ".wma")
1211

13-
IMAGE_SPECIAL = '<|endoftext10|>'
14-
AUDIO_SPECIAL = '<|endoftext11|>'
12+
IMAGE_SPECIAL = "<|endoftext10|>"
13+
AUDIO_SPECIAL = "<|endoftext11|>"
1514

1615
DEFAULT_SAMPLING_PARAMS = {
1716
"top_p": 0.0,
@@ -24,7 +23,6 @@
2423
MAX_NEW_TOKENS = 512
2524

2625

27-
2826
def history2messages(history: List[Dict]) -> List[Dict]:
2927
"""
3028
Transform gradio history to chat messages.
@@ -65,14 +63,15 @@ def history2messages(history: List[Dict]) -> List[Dict]:
6563
messages.append(cur_message)
6664
return messages, images, audios
6765

66+
6867
def check_messages(history, message, audio):
6968
has_text = message["text"] and message["text"].strip()
7069
has_files = len(message["files"]) > 0
7170
has_audio = audio is not None
7271

7372
if not (has_text or has_files or has_audio):
7473
raise gr.Error("Message is empty")
75-
74+
7675
audios = []
7776
images = []
7877

@@ -110,9 +109,10 @@ def check_messages(history, message, audio):
110109

111110
if message["text"]:
112111
history.append({"role": "user", "content": message["text"], "metadata": {}})
113-
112+
114113
return history, gr.MultimodalTextbox(value=None, interactive=False), None
115114

115+
116116
def make_demo(ov_model, processor):
117117
def bot(
118118
history: list,
@@ -130,7 +130,7 @@ def bot(
130130

131131
if not history:
132132
return history
133-
133+
134134
msgs, images, audios = history2messages(history)
135135
audios = audios if len(audios) > 0 else None
136136
images = images if len(images) > 0 else None
@@ -146,15 +146,13 @@ def bot(
146146
"temperature": temperature,
147147
"repetition_penalty": repetition_penalty,
148148
"max_new_tokens": max_new_tokens,
149-
"do_sample": temperature > 0,
149+
"do_sample": temperature > 0,
150150
"streamer": streamer,
151-
**inputs
151+
**inputs,
152152
}
153153

154154
history.append({"role": "assistant", "content": ""})
155155

156-
157-
158156
thread = Thread(target=ov_model.generate, kwargs=generation_params)
159157
thread.start()
160158

@@ -169,14 +167,13 @@ def change_state(state):
169167

170168
def reset_user_input():
171169
return gr.update(value="")
172-
170+
173171
with gr.Blocks(theme=gr.themes.Soft()) as demo:
174172
gr.Markdown("# 🪐 Chat with OpenVINO Phi-4-multimodal")
175-
chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages", height='48vh')
173+
chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages", height="48vh")
176174

177175
sampling_params_group_hidden_state = gr.State(False)
178176

179-
180177
with gr.Row(equal_height=True):
181178
chat_input = gr.MultimodalTextbox(
182179
file_count="multiple",
@@ -188,12 +185,7 @@ def reset_user_input():
188185
# stop_btn=True,
189186
)
190187
with gr.Row(equal_height=True):
191-
audio_input = gr.Audio(
192-
sources=["microphone", "upload"],
193-
type="filepath",
194-
scale=1,
195-
max_length=30
196-
)
188+
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", scale=1, max_length=30)
197189
with gr.Row(equal_height=True):
198190
with gr.Column(scale=1, min_width=150):
199191
with gr.Row(equal_height=True):
@@ -205,9 +197,7 @@ def reset_user_input():
205197

206198
with gr.Group(visible=False) as sampling_params_group:
207199
with gr.Row():
208-
temperature = gr.Slider(
209-
minimum=0, maximum=1, value=DEFAULT_SAMPLING_PARAMS["temperature"], label="Temperature"
210-
)
200+
temperature = gr.Slider(minimum=0, maximum=1, value=DEFAULT_SAMPLING_PARAMS["temperature"], label="Temperature")
211201
repetition_penalty = gr.Slider(
212202
minimum=0,
213203
maximum=2,
@@ -234,9 +224,9 @@ def reset_user_input():
234224
[sampling_params_group, sampling_params_group_hidden_state],
235225
)
236226
chat_msg = chat_input.submit(
237-
check_messages,
238-
[chatbot, chat_input, audio_input],
239-
[chatbot, chat_input, audio_input],
227+
check_messages,
228+
[chatbot, chat_input, audio_input],
229+
[chatbot, chat_input, audio_input],
240230
)
241231

242232
bot_msg = chat_msg.then(
@@ -252,4 +242,4 @@ def reset_user_input():
252242
inputs=[chatbot, top_p, top_k, temperature, repetition_penalty, max_new_tokens, gr.State(True)],
253243
outputs=chatbot,
254244
)
255-
return demo
245+
return demo

0 commit comments

Comments
 (0)