# Multimodal assistant with Phi-4-mini-multimodal and OpenVINO
Phi-4-multimodal-instruct is a lightweight, open multimodal foundation model that processes text, image, and audio inputs and generates text outputs. It is a 5.6B-parameter multimodal transformer built on the pretrained Phi-4-mini backbone language model, extended with vision and speech encoders and adapters.
In this tutorial we will explore how to run the Phi-4-multimodal model using [OpenVINO](https://github.com/openvinotoolkit/openvino) and optimize it using [NNCF](https://github.com/openvinotoolkit/nncf).
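The optimization step relies on NNCF's weight compression, which stores LLM weights as low-bit integers plus per-group scales. The toy sketch below illustrates the idea behind group-wise symmetric INT4 quantization in pure Python; it is an illustration only, not the actual NNCF API.

```python
# Toy illustration of group-wise symmetric INT4 weight quantization,
# the kind of compression NNCF applies to LLM weights.

def quantize_int4(weights, group_size=4):
    """Quantize a flat list of float weights to signed INT4 per group.

    Each group of `group_size` values shares one float scale; quantized
    values are integers clamped to [-8, 7] (the signed 4-bit range).
    """
    quantized, scales = [], []
    for start in range(0, len(weights), group_size):
        group = weights[start:start + group_size]
        scale = max(abs(w) for w in group) / 7 or 1.0  # guard against all-zero groups
        scales.append(scale)
        quantized.append([max(-8, min(7, round(w / scale))) for w in group])
    return quantized, scales

def dequantize_int4(quantized, scales):
    """Reconstruct approximate float weights from INT4 values and group scales."""
    return [v * s for group, s in zip(quantized, scales) for v in group]

weights = [0.12, -0.56, 0.33, 0.07, 1.40, -0.95, 0.02, 0.61]
q, s = quantize_int4(weights)
restored = dequantize_int4(q, s)
```

The reconstruction is lossy but close: each weight is recovered to within roughly half a quantization step, while storage drops from 32 bits to about 4 bits per weight (plus one scale per group).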
## Notebook contents
The tutorial consists of the following steps:
- Install requirements
- Convert and optimize the model
- Run OpenVINO model inference
- Launch the interactive demo
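The conversion step typically uses the `optimum-cli export openvino` command from optimum-intel. The snippet below only assembles that command (model support for Phi-4-multimodal and the output directory name are assumptions); run it with `subprocess.run(cmd)` once the requirements are installed, noting that it downloads the full checkpoint.

```python
# Sketch of the "Convert and optimize" step: build the optimum-intel export
# command that converts a Hugging Face checkpoint to OpenVINO IR with INT4
# weight compression. The command is assembled but intentionally not executed.

model_id = "microsoft/Phi-4-multimodal-instruct"
output_dir = "phi4-multimodal-ov"  # hypothetical local directory name

cmd = [
    "optimum-cli", "export", "openvino",
    "--model", model_id,
    "--weight-format", "int4",   # NNCF INT4 weight compression
    "--trust-remote-code",       # the model uses custom modeling code
    output_dir,
]
print(" ".join(cmd))
```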
In this demonstration, you'll create an interactive chatbot that can answer questions about the content of the provided images and audio.