|
| 1 | +from copy import deepcopy |
| 2 | +from typing import Dict, List |
| 3 | +from PIL import Image |
| 4 | +import librosa |
| 5 | +from transformers import TextIteratorStreamer |
| 6 | +from threading import Thread |
| 7 | +import gradio as gr |
| 8 | + |
# File suffixes accepted for uploads; used with ``str.endswith``, so every
# entry must include the leading dot.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".wma")

# Placeholder tokens prepended to a user turn's text, one per attached
# image / audio clip.
IMAGE_SPECIAL = "<|endoftext10|>"
AUDIO_SPECIAL = "<|endoftext11|>"

# Initial values for the sampling sliders in the demo UI.
# NOTE(review): "do_sample" and "num_beams" are not read anywhere in this
# file's visible code (generation derives do_sample from temperature > 0);
# confirm whether external callers rely on them.
DEFAULT_SAMPLING_PARAMS = {
    "top_p": 0.0,
    "top_k": 1,
    "temperature": 0.0,
    "do_sample": True,
    "num_beams": 1,
    "repetition_penalty": 1.2,
}
# Default and upper bound for the "Max New Tokens" slider.
MAX_NEW_TOKENS = 512
| 24 | + |
| 25 | + |
def history2messages(history: List[Dict]) -> "tuple[List[Dict], list, list]":
    """
    Transform gradio history to chat messages.

    Consecutive non-assistant history items are merged into a single
    ``{"role": "user", ...}`` message. Items whose ``metadata["title"]`` is
    ``"image"`` or ``"audio"`` contribute one special placeholder token each
    (prepended to the turn's text) and have their file loaded; items without
    a title supply the turn's text (a later text item in the same turn
    replaces an earlier one).

    Args:
        history: Gradio "messages"-style history; each item has ``"role"``
            and ``"content"`` keys and optionally ``"metadata"``. The list
            and its items are not modified.

    Returns:
        A ``(messages, images, audios)`` tuple: the chat messages, the
        opened ``PIL`` images, and the ``librosa.load`` results, each in
        history order.
    """
    messages = []
    images = []
    audios = []

    turn_open = False  # True while a user turn is being accumulated
    turn_text = ""
    turn_tags = ""

    for item in history:
        if item["role"] == "assistant":
            # An assistant reply closes whatever user turn is pending.
            if turn_open:
                messages.append({"role": "user", "content": turn_tags + turn_text})
                turn_open, turn_text, turn_tags = False, "", ""
            messages.append({"role": "assistant", "content": item["content"]})
            continue

        turn_open = True
        # Tolerate a missing or None metadata entry without touching the
        # caller's dict.
        title = (item.get("metadata") or {}).get("title")
        if title is None:
            turn_text = item["content"]
        elif title == "image":
            turn_tags += IMAGE_SPECIAL
            images.append(Image.open(item["content"][0]))
        elif title == "audio":
            turn_tags += AUDIO_SPECIAL
            audios.append(librosa.load(item["content"][0]))

    # Flush a trailing user turn that has no assistant reply yet.
    if turn_open:
        messages.append({"role": "user", "content": turn_tags + turn_text})
    return messages, images, audios
| 65 | + |
| 66 | + |
def check_messages(history, message, audio):
    """
    Validate a submitted chat message and append it to the history.

    Args:
        history: Current chatbot history (list of message dicts); new user
            entries are appended to it in place.
        message: Value of the multimodal textbox, a dict with ``"text"`` and
            ``"files"`` (list of file paths) keys.
        audio: File path from the separate audio input, or ``None``.

    Returns:
        A ``(history, textbox, audio_value)`` tuple: the updated history, a
        cleared non-interactive ``gr.MultimodalTextbox``, and ``None`` to
        reset the audio input.

    Raises:
        gr.Error: If the message is empty, a file has an unsupported
            extension, an uploaded audio file is longer than 60 s or has
            zero duration, or more than one image / audio clip is supplied.
    """
    import os  # local import: only needed for the error-message file name

    text = message["text"]
    has_text = bool(text and text.strip())
    files = message["files"] or []

    if not (has_text or files or audio is not None):
        raise gr.Error("Message is empty")

    audios = []
    images = []

    for file_path in files:
        # Compare case-insensitively so e.g. "photo.JPG" is accepted.
        suffix_probe = file_path.lower()
        if suffix_probe.endswith(AUDIO_EXTENSIONS):
            # NOTE(review): librosa 0.10 renamed ``filename`` to ``path`` and
            # plans to drop the old name in 1.0; kept here for compatibility
            # with whichever librosa version this project pins.
            duration = librosa.get_duration(filename=file_path)
            if duration > 60:
                raise gr.Error("Audio file too long. For efficiency we recommend to use audio < 60s")
            if duration == 0:
                raise gr.Error("Audio file too short")
            audios.append(file_path)
        elif suffix_probe.endswith(IMAGE_EXTENSIONS):
            images.append(file_path)
        else:
            filename = os.path.basename(file_path)
            raise gr.Error(f"Unsupported file type: {filename}. It should be an image or audio file.")

    if len(audios) > 1:
        raise gr.Error("Please upload only one audio file.")

    if len(images) > 1:
        raise gr.Error("Please upload only one image file.")

    if audio is not None:
        if audios:
            raise gr.Error("Please upload only one audio file or record audio.")
        audios.append(audio)

    # Append the message to the history: attachments first, then the text.
    for image_path in images:
        history.append({"role": "user", "content": (image_path,), "metadata": {"title": "image"}})

    for audio_path in audios:
        history.append({"role": "user", "content": (audio_path,), "metadata": {"title": "audio"}})

    # Use the same "non-blank" notion as the emptiness check above so that
    # whitespace-only text is not sent as a message.
    if has_text:
        history.append({"role": "user", "content": text, "metadata": {}})

    return history, gr.MultimodalTextbox(value=None, interactive=False), None
| 114 | + |
| 115 | + |
def make_demo(ov_model, processor):
    """
    Build the Gradio chat demo.

    Args:
        ov_model: Model object whose ``generate(**kwargs)`` is run in a
            background thread with the processor outputs, the sampling
            settings and a ``TextIteratorStreamer``.
        processor: Object exposing ``tokenizer.apply_chat_template`` and
            callable as ``processor(text=..., audios=..., images=...)``; its
            result is unpacked into the generation keyword arguments.

    Returns:
        The constructed ``gr.Blocks`` demo (not launched).
    """

    def bot(
        history: list,
        top_p: float,
        top_k: int,
        temperature: float,
        repetition_penalty: float,
        max_new_tokens: int = MAX_NEW_TOKENS,
        regenerate: bool = False,
    ):
        """Stream an assistant reply for ``history``, yielding the updated history."""
        # On regenerate, drop only a trailing assistant reply; a trailing
        # user message has no reply yet and must be kept.
        if regenerate and history and history[-1]["role"] == "assistant":
            history = history[:-1]

        if not history:
            # This function is a generator, so the value has to be yielded
            # for it to reach the chatbot output.
            yield history
            return

        msgs, images, audios = history2messages(history)

        prompt = processor.tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        # The processor is given None rather than an empty list when a
        # modality is absent.
        inputs = processor(text=prompt, audios=audios or None, images=images or None)
        streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_params = {
            "top_p": top_p,
            # Slider values may arrive as floats; these two must be integers.
            "top_k": int(top_k),
            "temperature": temperature,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": int(max_new_tokens),
            "do_sample": temperature > 0,
            "streamer": streamer,
            **inputs,
        }

        history.append({"role": "assistant", "content": ""})

        # Generation runs in a worker thread so tokens can be consumed from
        # the streamer as they are produced.
        thread = Thread(target=ov_model.generate, kwargs=generation_params)
        thread.start()

        buffer = ""
        for new_text in streamer:
            buffer += new_text
            history[-1]["content"] = buffer
            yield history

        thread.join()

    def change_state(state):
        """Toggle the sampling-parameter group's visibility and the stored flag."""
        return gr.update(visible=not state), not state

    # NOTE(review): the container nesting below was reconstructed from a
    # source whose indentation was lost — confirm against the intended layout.
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🪐 Chat with OpenVINO Phi-4-multimodal")
        chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages", height="48vh")

        sampling_params_group_hidden_state = gr.State(False)

        with gr.Row(equal_height=True):
            chat_input = gr.MultimodalTextbox(
                file_count="multiple",
                placeholder="Enter your prompt or upload image/audio here, then press ENTER...",
                show_label=False,
                scale=8,
                file_types=["image", "audio"],
                interactive=True,
                # stop_btn=True,
            )
        with gr.Row(equal_height=True):
            audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", scale=1, max_length=30)
        with gr.Row(equal_height=True):
            with gr.Column(scale=1, min_width=150):
                with gr.Row(equal_height=True):
                    regenerate_btn = gr.Button("Regenerate", variant="primary")
                    gr.ClearButton([chat_input, audio_input, chatbot])

                with gr.Row():
                    sampling_params_toggle_btn = gr.Button("Sampling Parameters")

                with gr.Group(visible=False) as sampling_params_group:
                    with gr.Row():
                        temperature = gr.Slider(minimum=0, maximum=1, value=DEFAULT_SAMPLING_PARAMS["temperature"], label="Temperature")
                        repetition_penalty = gr.Slider(
                            minimum=0,
                            maximum=2,
                            value=DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
                            label="Repetition Penalty",
                        )

                    with gr.Row():
                        top_p = gr.Slider(minimum=0, maximum=1, value=DEFAULT_SAMPLING_PARAMS["top_p"], label="Top-p")
                        top_k = gr.Slider(minimum=0, maximum=1000, value=DEFAULT_SAMPLING_PARAMS["top_k"], label="Top-k")

                    with gr.Row():
                        max_new_tokens = gr.Slider(
                            minimum=1,
                            maximum=MAX_NEW_TOKENS,
                            value=MAX_NEW_TOKENS,
                            label="Max New Tokens",
                            interactive=True,
                        )

        sampling_params_toggle_btn.click(
            change_state,
            sampling_params_group_hidden_state,
            [sampling_params_group, sampling_params_group_hidden_state],
        )
        chat_msg = chat_input.submit(
            check_messages,
            [chatbot, chat_input, audio_input],
            [chatbot, chat_input, audio_input],
        )

        # Only generate when validation succeeded; with ``.then`` the bot
        # would also run after check_messages raised gr.Error.
        bot_msg = chat_msg.success(
            bot,
            inputs=[chatbot, top_p, top_k, temperature, repetition_penalty, max_new_tokens],
            outputs=chatbot,
        )

        # Re-enable the textbox whether or not generation succeeded.
        bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])

        regenerate_btn.click(
            bot,
            inputs=[chatbot, top_p, top_k, temperature, repetition_penalty, max_new_tokens, gr.State(True)],
            outputs=chatbot,
        )
    return demo
0 commit comments