Commit f91ea19

Allow running Whisper and VLMs on NPU (#2803)
CVS-162932

1 parent: 62943b6
File tree: 11 files changed (+136 -207 lines)

notebooks/internvl2/internvl2.ipynb (+2 -2)

@@ -55,7 +55,7 @@
 "%pip install -q \"transformers>4.36\" \"torch>=2.1\" \"torchvision\" \"einops\" \"timm\" \"Pillow\" \"gradio>=4.36\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
 "%pip install -q \"nncf>=2.14.0\" \"datasets\"\n",
 "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
-"%pip install -q -U \"openvino>=2024.5\" \"openvino-tokenizers>=2024.5\" \"openvino-genai>=2024.5\"\n",
+"%pip install -q -U --pre \"openvino>=2025.0\" \"openvino-tokenizers>=2025.0\" \"openvino-genai>=2025.0\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n",
 "\n",
 "if platform.system() == \"Darwin\":\n",
 "    %pip install -q \"numpy<2.0.0\""
@@ -371,7 +371,7 @@
 "source": [
 "from notebook_utils import device_widget\n",
 "\n",
-"device = device_widget(default=\"CPU\", exclude=[\"NPU\", \"AUTO\"])\n",
+"device = device_widget(default=\"CPU\", exclude=[\"AUTO\"])\n",
 "\n",
 "device"
 ]
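With the NPU exclusion dropped from device_widget, the device selected here can be handed straight to an openvino-genai pipeline. A minimal sketch of the downstream usage this unlocks, assuming a VLM already exported with optimum-cli to a local directory (the directory name, prompt, and generation parameters are illustrative, not taken from the diff):

    import openvino_genai as ov_genai

    model_dir = "internvl2_int4_ov"  # illustrative: any optimum-cli exported VLM directory
    # device.value can now be "NPU" in addition to "CPU" and "GPU"
    pipe = ov_genai.VLMPipeline(model_dir, device.value)
    # image inputs omitted for brevity; see the notebook for full multimodal usage
    print(pipe.generate("Describe the image.", max_new_tokens=64))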

notebooks/llava-multimodal-chatbot/llava-multimodal-chatbot-genai.ipynb (+4 -4)

@@ -97,10 +97,10 @@
 "from pathlib import Path\n",
 "import requests\n",
 "\n",
-"%pip install \"torch>=2.3.0\" \"torchvision\" \"torchaudio\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
+"%pip install \"torch>=2.1.0\" \"torchvision\" \"torchaudio\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
 "%pip install \"git+https://github.com/huggingface/optimum-intel.git\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
-"%pip install \"nncf>=2.14.0\" \"sentencepiece\" \"tokenizers>=0.12.1\" \"transformers>=4.45.0\" \"gradio>=4.36\"\n",
-"%pip install -U \"openvino-tokenizers>=2024.5.0\" \"openvino>=2024.5.0\" \"openvino-genai>=2024.5.0\"\n",
+"%pip install \"nncf>=2.15.0\" \"sentencepiece\" \"tokenizers>=0.12.1\" \"transformers>=4.45.0\" \"gradio>=4.36\"\n",
+"%pip install -q -U --pre \"openvino>=2025.0\" \"openvino-tokenizers>=2025.0\" \"openvino-genai>=2025.0\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n",
 "\n",
 "\n",
 "utility_files = [\"notebook_utils.py\", \"cmd_helper.py\"]\n",
@@ -355,7 +355,7 @@
 "source": [
 "from notebook_utils import device_widget\n",
 "\n",
-"device = device_widget(exclude=[\"NPU\"])\n",
+"device = device_widget()\n",
 "\n",
 "device"
 ]

notebooks/llava-next-multimodal-chatbot/llava-next-multimodal-chatbot.ipynb (+2 -2)

@@ -64,7 +64,7 @@
 "outputs": [],
 "source": [
 "%pip install -q \"nncf>=2.14.0\" \"torch>=2.1\" \"transformers>=4.39.1\" \"accelerate\" \"pillow\" \"gradio>=4.26\" \"datasets>=2.14.6\" \"tqdm\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
-"%pip install -q -U \"openvino>=2024.5.0\" \"openvino-tokenizers>=2024.5.0\" \"openvino-genai>=2024.5\"\n",
+"%pip install -q -U --pre \"openvino>=2025.0\" \"openvino-tokenizers>=2025.0\" \"openvino-genai>=2025.0\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n",
 "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" --extra-index-url https://download.pytorch.org/whl/cpu"
 ]
 },
@@ -305,7 +305,7 @@
 "source": [
 "from notebook_utils import device_widget\n",
 "\n",
-"device = device_widget(\"CPU\", exclude=[\"NPU\"])\n",
+"device = device_widget(\"CPU\")\n",
 "\n",
 "device"
 ]

notebooks/llm-question-answering/llm-question-answering.ipynb (+42 -63)

@@ -190,6 +190,28 @@
 "print(f\"Selected model {model_id.value}\")"
 ]
 },
+{
+"attachments": {},
+"cell_type": "markdown",
+"id": "ec2cdb27",
+"metadata": {},
+"source": [
+"## Select device for inference and model variant\n",
+"[back to top ⬆️](#Table-of-contents:)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "c0f95e9b",
+"metadata": {},
+"outputs": [],
+"source": [
+"device = device_widget(\"CPU\")\n",
+"\n",
+"device"
+]
+},
 {
 "attachments": {},
 "cell_type": "markdown",
@@ -291,7 +313,7 @@
 "prepare_int8_model = widgets.Checkbox(\n",
 "    value=False,\n",
 "    description=\"Prepare INT8 model\",\n",
-"    disabled=False,\n",
+"    disabled=device.value == \"NPU\",\n",
 ")\n",
 "prepare_fp16_model = widgets.Checkbox(\n",
 "    value=False,\n",
@@ -330,6 +352,7 @@
 "fp16_model_dir = Path(model_id.value) / \"FP16\"\n",
 "int8_model_dir = Path(model_id.value) / \"INT8_compressed_weights\"\n",
 "int4_model_dir = Path(model_id.value) / \"INT4_compressed_weights\"\n",
+"int4_npu_friendly = Path(model_id.value) / \"INT4_NPU_compressed_weights\"\n",
 "\n",
 "core = ov.Core()\n",
 "\n",
@@ -376,15 +399,22 @@
 "        },\n",
 "    }\n",
 "\n",
+"    int4_result_model_dir = int4_model_dir if device.value != \"NPU\" else int4_npu_friendly\n",
+"\n",
 "    model_compression_params = compression_configs.get(model_id.value, compression_configs[\"default\"])\n",
-"    if (int4_model_dir / \"openvino_model.xml\").exists():\n",
+"    if device.value == \"NPU\":\n",
+"        model_compression_params[\"group_size\"] = -1\n",
+"        model_compression_params[\"sym\"] = True\n",
+"        model_compression_params[\"ratio\"] = 1.0\n",
+"\n",
+"    if (int4_result_model_dir / \"openvino_model.xml\").exists():\n",
 "        return\n",
 "    export_command_base = \"optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int4\".format(pt_model_id)\n",
 "    int4_compression_args = \" --group-size {} --ratio {}\".format(model_compression_params[\"group_size\"], model_compression_params[\"ratio\"])\n",
 "    if model_compression_params[\"sym\"]:\n",
 "        int4_compression_args += \" --sym\"\n",
 "    export_command_base += int4_compression_args\n",
-"    export_command = export_command_base + \" \" + str(int4_model_dir)\n",
+"    export_command = export_command_base + \" \" + str(int4_result_model_dir)\n",
 "    display(Markdown(\"**Export command:**\"))\n",
 "    display(Markdown(f\"`{export_command}`\"))\n",
 "    ! $export_command\n",
@@ -428,7 +458,7 @@
 "source": [
 "fp16_weights = fp16_model_dir / \"openvino_model.bin\"\n",
 "int8_weights = int8_model_dir / \"openvino_model.bin\"\n",
-"int4_weights = int4_model_dir / \"openvino_model.bin\"\n",
+"int4_weights = (int4_model_dir if not device.value == \"NPU\" else int4_npu_friendly) / \"openvino_model.bin\"\n",
 "\n",
 "if fp16_weights.exists():\n",
 "    print(f\"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB\")\n",
@@ -439,48 +469,6 @@
 "    print(f\"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}\")"
 ]
 },
-{
-"attachments": {},
-"cell_type": "markdown",
-"id": "3df73379-bccc-41b1-9c94-c3040819805b",
-"metadata": {},
-"source": [
-"## Select device for inference and model variant\n",
-"[back to top ⬆️](#Table-of-contents:)\n",
-"\n",
-">**Note**: There may be no speedup for INT4/INT8 compressed models on dGPU."
-]
-},
-{
-"cell_type": "code",
-"execution_count": 8,
-"id": "d2d7bf5b-8a05-4c3b-a36b-631af5c197e9",
-"metadata": {},
-"outputs": [
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "76c6c52c8af04d9084b1e3b56686f563",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')"
-]
-},
-"execution_count": 8,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"core = ov.Core()\n",
-"\n",
-"device = device_widget(\"CPU\", exclude=[\"NPU\"])\n",
-"\n",
-"device"
-]
-},
 {
 "cell_type": "code",
 "execution_count": 9,
@@ -505,9 +493,11 @@
 ],
 "source": [
 "available_models = []\n",
-"if int4_model_dir.exists():\n",
+"if int4_model_dir.exists() and device.value != \"NPU\":\n",
 "    available_models.append(\"INT4\")\n",
-"if int8_model_dir.exists():\n",
+"if int4_npu_friendly.exists() and device.value == \"NPU\":\n",
+"    available_models.append(\"INT4\")\n",
+"if int8_model_dir.exists() and device.value != \"NPU\":\n",
 "    available_models.append(\"INT8\")\n",
 "if fp16_model_dir.exists():\n",
 "    available_models.append(\"FP16\")\n",
@@ -541,7 +531,7 @@
 "from openvino_tokenizers import convert_tokenizer\n",
 "\n",
 "if model_to_run.value == \"INT4\":\n",
-"    model_dir = int4_model_dir\n",
+"    model_dir = int4_model_dir if device.value != \"NPU\" else int4_npu_friendly\n",
 "elif model_to_run.value == \"INT8\":\n",
 "    model_dir = int8_model_dir\n",
 "else:\n",
@@ -880,24 +870,13 @@
 "demo = make_demo(run_fn=run_generation, title=f\"Question Answering with {model_id.value} and OpenVINO\")\n",
 "\n",
 "try:\n",
-"    demo.queue().launch(height=800)\n",
+"    demo.queue().launch(height=800, debug=True)\n",
 "except Exception:\n",
-"    demo.queue().launch(share=True, height=800)\n",
+"    demo.queue().launch(share=True, height=800, debug=True)\n",
 "# If you are launching remotely, specify server_name and server_port\n",
 "# EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')`\n",
 "# To learn more please refer to the Gradio docs: https://gradio.app/docs/"
 ]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"id": "59038a29",
-"metadata": {},
-"outputs": [],
-"source": [
-"# please uncomment and run this cell for stopping gradio interface\n",
-"# demo.close()"
-]
 }
 ],
 "metadata": {

notebooks/minicpm-v-multimodal-chatbot/minicpm-v-multimodal-chatbot.ipynb (+2 -2)

@@ -55,7 +55,7 @@
 "%pip install -q \"torch>=2.1\" \"torchvision\" \"timm>=0.9.2\" \"transformers>=4.45\" \"Pillow\" \"gradio>=4.40\" \"tqdm\" \"sentencepiece\" \"peft\" \"huggingface-hub>=0.24.0\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
 "%pip install -q \"nncf>=2.14.0\"\n",
 "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
-"%pip install -q -U \"openvino>=2024.5\" \"openvino-tokenizers>=2024.5\" \"openvino-genai>=2024.5\""
+"%pip install -q -U --pre \"openvino>=2025.0\" \"openvino-tokenizers>=2025.0\" \"openvino-genai>=2025.0\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly"
 ]
 },
 {
@@ -236,7 +236,7 @@
 "source": [
 "from notebook_utils import device_widget\n",
 "\n",
-"device = device_widget(default=\"AUTO\", exclude=[\"NPU\"])\n",
+"device = device_widget(default=\"AUTO\")\n",
 "\n",
 "device"
 ]

notebooks/nuextract-structure-extraction/nuextract-structure-extraction.ipynb (+26 -44)

@@ -65,7 +65,7 @@
 "outputs": [],
 "source": [
 "%pip uninstall -q -y optimum optimum-intel\n",
-"%pip install -Uq \"openvino>=2024.3.0\" \"openvino-genai\"\n",
+"%pip install -q -U --pre \"openvino>=2025.0.0\" openvino-tokenizers[transformers] openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n",
 "%pip install -q \"torch>=2.1\" \"nncf>=2.12\" \"transformers>=4.40.0\" \"accelerate\" \"gradio>=4.19\" \"git+https://github.com/huggingface/optimum-intel.git\" --extra-index-url https://download.pytorch.org/whl/cpu"
 ]
 },
@@ -112,6 +112,30 @@
 "collect_telemetry(\"nuextract-structure-extraction.ipynb\")"
 ]
 },
+{
+"attachments": {},
+"cell_type": "markdown",
+"id": "83259407",
+"metadata": {},
+"source": [
+"## Select device for inference\n",
+"[back to top ⬆️](#Table-of-contents:)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "9ad5772d",
+"metadata": {},
+"outputs": [],
+"source": [
+"from notebook_utils import device_widget\n",
+"\n",
+"device = device_widget(default=\"CPU\")\n",
+"\n",
+"device"
+]
+},
 {
 "attachments": {},
 "cell_type": "markdown",
@@ -164,7 +188,7 @@
 "    \"NuExtract_large\": {\"model_id\": \"numind/NuExtract-large\"},\n",
 "}\n",
 "\n",
-"form, _, model_dropdown, compression_dropdown, _ = get_llm_selection_widget(languages=None, models=models, show_preconverted_checkbox=False)\n",
+"form, _, model_dropdown, compression_dropdown, _ = get_llm_selection_widget(languages=None, models=models, show_preconverted_checkbox=False, device=device.value)\n",
 "\n",
 "form"
 ]
@@ -345,48 +369,6 @@
 "compare_model_size(model_dir)"
 ]
 },
-{
-"attachments": {},
-"cell_type": "markdown",
-"id": "3df73379-bccc-41b1-9c94-c3040819805b",
-"metadata": {},
-"source": [
-"## Select device for inference and model variant\n",
-"[back to top ⬆️](#Table-of-contents:)\n",
-"\n",
-">**Note**: There may be no speedup for INT4/INT8 compressed models on dGPU."
-]
-},
-{
-"cell_type": "code",
-"execution_count": 8,
-"id": "d2d7bf5b-8a05-4c3b-a36b-631af5c197e9",
-"metadata": {},
-"outputs": [
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "fa8a102d350b487cb3e0a4cb397295e6",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"Dropdown(description='Device:', options=('CPU', 'GPU', 'AUTO'), value='CPU')"
-]
-},
-"execution_count": 8,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"from notebook_utils import device_widget\n",
-"\n",
-"device = device_widget(default=\"CPU\", exclude=[\"NPU\"])\n",
-"\n",
-"device"
-]
-},
 {
 "attachments": {},
 "cell_type": "markdown",
