|
190 | 190 |   "print(f\"Selected model {model_id.value}\")"
191 | 191 |   ]
192 | 192 |   },
    | 193 | + {
    | 194 | + "attachments": {},
    | 195 | + "cell_type": "markdown",
    | 196 | + "id": "ec2cdb27",
    | 197 | + "metadata": {},
    | 198 | + "source": [
    | 199 | + "## Select device for inference and model variant\n",
    | 200 | + "[back to top ⬆️](#Table-of-contents:)"
    | 201 | + ]
    | 202 | + },
    | 203 | + {
    | 204 | + "cell_type": "code",
    | 205 | + "execution_count": null,
    | 206 | + "id": "c0f95e9b",
    | 207 | + "metadata": {},
    | 208 | + "outputs": [],
    | 209 | + "source": [
    | 210 | + "device = device_widget(\"CPU\")\n",
    | 211 | + "\n",
    | 212 | + "device"
    | 213 | + ]
    | 214 | + },
193 | 215 |   {
194 | 216 |   "attachments": {},
195 | 217 |   "cell_type": "markdown",
|
291 | 313 |   "prepare_int8_model = widgets.Checkbox(\n",
292 | 314 |   " value=False,\n",
293 | 315 |   " description=\"Prepare INT8 model\",\n",
294 |     | - " disabled=False,\n",
    | 316 | + " disabled=device.value == \"NPU\",\n",
295 | 317 |   ")\n",
296 | 318 |   "prepare_fp16_model = widgets.Checkbox(\n",
297 | 319 |   " value=False,\n",
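Note that `disabled=device.value == "NPU"` is evaluated once, when this cell runs, so the checkbox only reflects the device chosen at that moment; picking a different device afterwards requires re-running the cell. A reactive variant (a sketch, not what this PR does) could observe the dropdown instead:

```python
import ipywidgets as widgets

prepare_int8_model = widgets.Checkbox(value=False, description="Prepare INT8 model")

def _toggle_int8(change):
    # INT8 weight compression is not offered on NPU, so grey the box out.
    prepare_int8_model.disabled = change["new"] == "NPU"

device.observe(_toggle_int8, names="value")  # `device` is the dropdown added above
```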
|
|
330 | 352 |   "fp16_model_dir = Path(model_id.value) / \"FP16\"\n",
331 | 353 |   "int8_model_dir = Path(model_id.value) / \"INT8_compressed_weights\"\n",
332 | 354 |   "int4_model_dir = Path(model_id.value) / \"INT4_compressed_weights\"\n",
    | 355 | + "int4_npu_friendly = Path(model_id.value) / \"INT4_NPU_compressed_weights\"\n",
333 | 356 |   "\n",
334 | 357 |   "core = ov.Core()\n",
335 | 358 |   "\n",
|
|
376 | 399 |   " },\n",
377 | 400 |   " }\n",
378 | 401 |   "\n",
    | 402 | + " int4_result_model_dir = int4_model_dir if device.value != \"NPU\" else int4_npu_friendly\n",
    | 403 | + "\n",
379 | 404 |   " model_compression_params = compression_configs.get(model_id.value, compression_configs[\"default\"])\n",
380 |     | - " if (int4_model_dir / \"openvino_model.xml\").exists():\n",
    | 405 | + " if device.value == \"NPU\":\n",
    | 406 | + " model_compression_params[\"group_size\"] = -1\n",
    | 407 | + " model_compression_params[\"sym\"] = True\n",
    | 408 | + " model_compression_params[\"ratio\"] = 1.0\n",
    | 409 | + " \n",
    | 410 | + " if (int4_result_model_dir / \"openvino_model.xml\").exists():\n",
381 | 411 |   " return\n",
382 | 412 |   "export_command_base = \"optimum-cli export openvino --model {} --task text-generation-with-past --weight-format int4\".format(pt_model_id)\n",
383 |     | - " int4_compression_args = \" --group-size {} --ratio {}\".format(model_compression_params[\"group_size\"], model_compression_params[\"ratio\"])\n",
    | 413 | + " int4_compression_args = \" --group-size {} --ratio {}\".format(model_compression_params[\"group_size\"], model_compression_params[\"ratio\"]) \n",
384 | 414 |   " if model_compression_params[\"sym\"]:\n",
385 | 415 |   " int4_compression_args += \" --sym\"\n",
386 | 416 |   " export_command_base += int4_compression_args\n",
387 |     | - " export_command = export_command_base + \" \" + str(int4_model_dir)\n",
    | 417 | + " export_command = export_command_base + \" \" + str(int4_result_model_dir)\n",
388 | 418 |   "display(Markdown(\"**Export command:**\"))\n",
389 | 419 |   "display(Markdown(f\"`{export_command}`\"))\n",
390 | 420 |   " ! $export_command\n",
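The three overrides are the NPU-friendly core of the change: `group_size = -1` switches from grouped to channel-wise quantization scales, `sym = True` forces symmetric quantization, and `ratio = 1.0` applies INT4 to all eligible layers instead of a mixed INT4/INT8 split. The cell drives this through `optimum-cli`, but the same configuration expressed directly with NNCF would look roughly like this (illustration only; `model` is assumed to be the exported FP16 `ov.Model`):

```python
import nncf

# Channel-wise symmetric INT4 over all eligible layers, mirroring the
# --sym --group-size -1 --ratio 1.0 flags the export command builds above.
compressed_model = nncf.compress_weights(
    model,
    mode=nncf.CompressWeightsMode.INT4_SYM,
    group_size=-1,
    ratio=1.0,
)
```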
|
|
428 | 458 |   "source": [
429 | 459 |   "fp16_weights = fp16_model_dir / \"openvino_model.bin\"\n",
430 | 460 |   "int8_weights = int8_model_dir / \"openvino_model.bin\"\n",
431 |     | - "int4_weights = int4_model_dir / \"openvino_model.bin\"\n",
    | 461 | + "int4_weights = (int4_model_dir if not device.value == \"NPU\" else int4_npu_friendly) / \"openvino_model.bin\"\n",
432 | 462 |   "\n",
433 | 463 |   "if fp16_weights.exists():\n",
434 | 464 |   " print(f\"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB\")\n",
|
|
439 | 469 |   " print(f\"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}\")"
440 | 470 |   ]
441 | 471 |   },
442 |     | - {
443 |     | - "attachments": {},
444 |     | - "cell_type": "markdown",
445 |     | - "id": "3df73379-bccc-41b1-9c94-c3040819805b",
446 |     | - "metadata": {},
447 |     | - "source": [
448 |     | - "## Select device for inference and model variant\n",
449 |     | - "[back to top ⬆️](#Table-of-contents:)\n",
450 |     | - "\n",
451 |     | - ">**Note**: There may be no speedup for INT4/INT8 compressed models on dGPU."
452 |     | - ]
453 |     | - },
454 |     | - {
455 |     | - "cell_type": "code",
456 |     | - "execution_count": 8,
457 |     | - "id": "d2d7bf5b-8a05-4c3b-a36b-631af5c197e9",
458 |     | - "metadata": {},
459 |     | - "outputs": [
460 |     | - {
461 |     | - "data": {
462 |     | - "application/vnd.jupyter.widget-view+json": {
463 |     | - "model_id": "76c6c52c8af04d9084b1e3b56686f563",
464 |     | - "version_major": 2,
465 |     | - "version_minor": 0
466 |     | - },
467 |     | - "text/plain": [
468 |     | - "Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')"
469 |     | - ]
470 |     | - },
471 |     | - "execution_count": 8,
472 |     | - "metadata": {},
473 |     | - "output_type": "execute_result"
474 |     | - }
475 |     | - ],
476 |     | - "source": [
477 |     | - "core = ov.Core()\n",
478 |     | - "\n",
479 |     | - "device = device_widget(\"CPU\", exclude=[\"NPU\"])\n",
480 |     | - "\n",
481 |     | - "device"
482 |     | - ]
483 |     | - },
484 | 472 |   {
485 | 473 |   "cell_type": "code",
486 | 474 |   "execution_count": 9,
|
|
505 | 493 |   ],
506 | 494 |   "source": [
507 | 495 |   "available_models = []\n",
508 |     | - "if int4_model_dir.exists():\n",
    | 496 | + "if int4_model_dir.exists() and device.value != \"NPU\":\n",
509 | 497 |   " available_models.append(\"INT4\")\n",
510 |     | - "if int8_model_dir.exists():\n",
    | 498 | + "if int4_npu_friendly.exists() and device.value == \"NPU\":\n",
    | 499 | + " available_models.append(\"INT4\")\n",
    | 500 | + "if int8_model_dir.exists() and device.value != \"NPU\":\n",
511 | 501 |   " available_models.append(\"INT8\")\n",
512 | 502 |   "if fp16_model_dir.exists():\n",
513 | 503 |   " available_models.append(\"FP16\")\n",
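The four `if` statements encode a single rule: on NPU, only the channel-wise INT4 folder and FP16 are offered, while other devices also get grouped INT4 and INT8. An equivalent, slightly flatter formulation (a readability sketch, not the PR's code):

```python
# Same availability rule, factored through one device check.
is_npu = device.value == "NPU"
int4_dir = int4_npu_friendly if is_npu else int4_model_dir

available_models = []
if int4_dir.exists():
    available_models.append("INT4")
if int8_model_dir.exists() and not is_npu:
    available_models.append("INT8")
if fp16_model_dir.exists():
    available_models.append("FP16")
```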
|
|
541 | 531 |   "from openvino_tokenizers import convert_tokenizer\n",
542 | 532 |   "\n",
543 | 533 |   "if model_to_run.value == \"INT4\":\n",
544 |     | - " model_dir = int4_model_dir\n",
    | 534 | + " model_dir = int4_model_dir if device.value != \"NPU\" else int4_npu_friendly\n",
545 | 535 |   "elif model_to_run.value == \"INT8\":\n",
546 | 536 |   " model_dir = int8_model_dir\n",
547 | 537 |   "else:\n",
|
|
880 | 870 |   "demo = make_demo(run_fn=run_generation, title=f\"Question Answering with {model_id.value} and OpenVINO\")\n",
881 | 871 |   "\n",
882 | 872 |   "try:\n",
883 |     | - " demo.queue().launch(height=800)\n",
    | 873 | + " demo.queue().launch(height=800, debug=True)\n",
884 | 874 |   "except Exception:\n",
885 |     | - " demo.queue().launch(share=True, height=800)\n",
    | 875 | + " demo.queue().launch(share=True, height=800, debug=True)\n",
886 | 876 |   "# If you are launching remotely, specify server_name and server_port\n",
887 | 877 |   "# EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')`\n",
888 | 878 |   "# To learn more please refer to the Gradio docs: https://gradio.app/docs/"
889 | 879 |   ]
890 |     | - },
891 |     | - {
892 |     | - "cell_type": "code",
893 |     | - "execution_count": null,
894 |     | - "id": "59038a29",
895 |     | - "metadata": {},
896 |     | - "outputs": [],
897 |     | - "source": [
898 |     | - "# please uncomment and run this cell for stopping gradio interface\n",
899 |     | - "# demo.close()"
900 |     | - ]
901 | 880 |   }
902 | 881 |   ],
903 | 882 |   "metadata": {
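On the `debug=True` additions in this hunk: in Gradio, `debug=True` keeps `launch()` blocking and prints server-side errors into the cell output, which plausibly also explains why the separate `# demo.close()` cell is removed here: interrupting the kernel now shuts the demo down. For the remote-launch comment, a concrete form (host and port are placeholders; `server_port` takes an `int`) would be:

```python
# Placeholder values -- substitute your own host and port.
demo.queue().launch(server_name="0.0.0.0", server_port=7860, debug=True)
```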
|
|