|
924 | 924 | "### Prepare calibration datasets\n",
|
925 | 925 | "[back to top ⬆️](#Table-of-contents:)\n",
|
926 | 926 | "\n",
|
927 |  | - "First step is to prepare calibration datasets for quantization. Since we quantize whisper encoder and decoder separately, we need to prepare a calibration dataset for each of the models. We define a `InferRequestWrapper` class that will intercept model inputs and collect them to a list. Then we run model inference on some small amount of audio samples. Generally, increasing the calibration dataset size improves quantization quality."
|
 | 927 | + "The first step is to prepare calibration datasets for quantization. Since we quantize the Whisper encoder and decoder separately, we need to prepare a calibration dataset for each model. We import an `InferRequestWrapper` class that intercepts model inputs and collects them into a list. Then we run model inference on a small number of audio samples. Generally, increasing the calibration dataset size improves quantization quality."
928 | 928 | ]
|
929 | 929 | },
|
930 | 930 | {
|
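To make the wrapper's role concrete before the code cell below: it replaces a compiled model's infer request and records every input before forwarding the call. A minimal sketch, assuming the `optimum-intel` import shown in this diff plus an `ov_model` and `input_features` already defined in earlier notebook cells:

```python
from optimum.intel.openvino.quantization import InferRequestWrapper

# Swap the encoder's InferRequest for a wrapper that appends every set of
# inputs it receives to the cache list, then forwards the call unchanged.
encoder_calibration_data = []
ov_model.encoder.request = InferRequestWrapper(ov_model.encoder.request, encoder_calibration_data)

# Any generation run now records encoder inputs as a side effect.
ov_model.generate(input_features)
print(f"Collected {len(encoder_calibration_data)} encoder input sets")
```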
|
946 | 946 | "%%skip not $to_quantize.value\n",
|
947 | 947 | "\n",
|
948 | 948 | "from itertools import islice\n",
|
949 |  | - "from typing import List, Any\n",
|
950 |  | - "from openvino import Tensor\n",
|
 | 949 | + "from optimum.intel.openvino.quantization import InferRequestWrapper\n",
951 | 950 | "\n",
|
952 | 951 | "\n",
|
953 |  | - "class InferRequestWrapper:\n",
|
954 |  | - "    def __init__(self, request, data_cache: List):\n",
|
955 |  | - "        self.request = request\n",
|
956 |  | - "        self.data_cache = data_cache\n",
|
957 |  | - "\n",
|
958 |  | - "    def __call__(self, *args, **kwargs):\n",
|
959 |  | - "        self.data_cache.append(*args)\n",
|
960 |  | - "        return self.request(*args, **kwargs)\n",
|
961 |  | - "\n",
|
962 |  | - "    def infer(self, inputs: Any = None, shared_memory: bool = False):\n",
|
963 |  | - "        self.data_cache.append(inputs)\n",
|
964 |  | - "        return self.request.infer(inputs, shared_memory)\n",
|
965 |  | - "\n",
|
966 |  | - "    def start_async(\n",
|
967 |  | - "        self,\n",
|
968 |  | - "        inputs: Any = None,\n",
|
969 |  | - "        userdata: Any = None,\n",
|
970 |  | - "        share_inputs: bool = False,\n",
|
971 |  | - "    ):\n",
|
972 |  | - "        self.data_cache.append(inputs)\n",
|
973 |  | - "        self.request.infer(inputs, share_inputs)\n",
|
974 |  | - "\n",
|
975 |  | - "    def wait(self):\n",
|
976 |  | - "        pass\n",
|
977 |  | - "\n",
|
978 |  | - "    def get_tensor(self, name: str):\n",
|
979 |  | - "        return Tensor(self.request.results[name])\n",
|
980 |  | - "\n",
|
981 |  | - "    def __getattr__(self, attr):\n",
|
982 |  | - "        if attr in self.__dict__:\n",
|
983 |  | - "            return getattr(self, attr)\n",
|
984 |  | - "        return getattr(self.request, attr)\n",
|
985 |  | - "\n",
|
986 |  | - "def collect_calibration_dataset(ov_model, calibration_dataset_size):\n",
|
 | 952 | + "def collect_calibration_dataset(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):\n",
987 | 953 | "    # Overwrite model request properties, saving the original ones for restoring later\n",
|
988 | 954 | "    original_encoder_request = ov_model.encoder.request\n",
|
989 | 955 | "    original_decoder_with_past_request = ov_model.decoder_with_past.request\n",
|
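The hunk above shows only the start of `collect_calibration_dataset`; the rest of its body is not part of this diff. As a hedged reconstruction of the usual pattern (the dataset name, the streaming split, and the `processor` object are assumptions borrowed from similar Whisper quantization notebooks, not lines from this commit):

```python
from itertools import islice
from datasets import load_dataset

def collect_calibration_dataset(ov_model, calibration_dataset_size):
    # Save the original requests, then substitute caching wrappers.
    original_encoder_request = ov_model.encoder.request
    original_decoder_with_past_request = ov_model.decoder_with_past.request
    encoder_calibration_data, decoder_calibration_data = [], []
    ov_model.encoder.request = InferRequestWrapper(original_encoder_request, encoder_calibration_data)
    ov_model.decoder_with_past.request = InferRequestWrapper(
        original_decoder_with_past_request, decoder_calibration_data
    )

    # Stream a small number of audio samples through the model so the wrappers
    # can record real encoder/decoder inputs. Dataset choice is an assumption.
    dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)
    for sample in islice(dataset, calibration_dataset_size):
        audio = sample["audio"]
        input_features = processor(  # `processor` assumed loaded earlier in the notebook
            audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
        ).input_features
        ov_model.generate(input_features)

    # Restore the original requests so subsequent inference is unaffected.
    ov_model.encoder.request = original_encoder_request
    ov_model.decoder_with_past.request = original_decoder_with_past_request
    return encoder_calibration_data, decoder_calibration_data
```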
|
1124 | 1090 | "import nncf\n",
|
1125 | 1091 | "\n",
|
1126 | 1092 | "CALIBRATION_DATASET_SIZE = 50\n",
|
1127 |  | - "quantized_distil_model_path = Path(f\"{model_path}_quantized\")\n",
|
 | 1093 | + "quantized_model_path = Path(f\"{model_path}_quantized\")\n",
1128 | 1094 | "\n",
|
1129 | 1095 | "\n",
|
1130 |  | - "def quantize(ov_model, calibration_dataset_size):\n",
|
1131 |  | - "    if not quantized_distil_model_path.exists():\n",
|
 | 1096 | + "def quantize(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):\n",
|
 | 1097 | + "    if not quantized_model_path.exists():\n",
1132 | 1098 | "        encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(\n",
|
1133 | 1099 | "            ov_model, calibration_dataset_size\n",
|
1134 | 1100 | "        )\n",
|
1135 | 1101 | "        print(\"Quantizing encoder\")\n",
|
1136 | 1102 | "        quantized_encoder = nncf.quantize(\n",
|
1137 | 1103 | "            ov_model.encoder.model,\n",
|
1138 | 1104 | "            nncf.Dataset(encoder_calibration_data),\n",
|
1139 |  | - "            preset=nncf.QuantizationPreset.MIXED,\n",
|
1140 | 1105 | "            subset_size=len(encoder_calibration_data),\n",
|
1141 | 1106 | "            model_type=nncf.ModelType.TRANSFORMER,\n",
|
1142 | 1107 | "            # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n",
|
1143 | 1108 | "            advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.50)\n",
|
1144 | 1109 | "        )\n",
|
1145 |  | - "        ov.save_model(quantized_encoder, quantized_distil_model_path / \"openvino_encoder_model.xml\")\n",
|
 | 1110 | + "        ov.save_model(quantized_encoder, quantized_model_path / \"openvino_encoder_model.xml\")\n",
|
1146 | 1111 | "        del quantized_encoder\n",
|
1147 | 1112 | "        del encoder_calibration_data\n",
|
1148 | 1113 | "        gc.collect()\n",
|
|
1151 | 1116 | "        quantized_decoder_with_past = nncf.quantize(\n",
|
1152 | 1117 | "            ov_model.decoder_with_past.model,\n",
|
1153 | 1118 | "            nncf.Dataset(decoder_calibration_data),\n",
|
1154 |  | - "            preset=nncf.QuantizationPreset.MIXED,\n",
|
1155 | 1119 | "            subset_size=len(decoder_calibration_data),\n",
|
1156 | 1120 | "            model_type=nncf.ModelType.TRANSFORMER,\n",
|
1157 | 1121 | "            # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n",
|
1158 | 1122 | "            advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.95)\n",
|
1159 | 1123 | "        )\n",
|
1160 |  | - "        ov.save_model(quantized_decoder_with_past, quantized_distil_model_path / \"openvino_decoder_with_past_model.xml\")\n",
|
 | 1124 | + "        ov.save_model(quantized_decoder_with_past, quantized_model_path / \"openvino_decoder_with_past_model.xml\")\n",
|
1161 | 1125 | "        del quantized_decoder_with_past\n",
|
1162 | 1126 | "        del decoder_calibration_data\n",
|
1163 | 1127 | "        gc.collect()\n",
|
1164 | 1128 | "\n",
|
1165 | 1129 | "        # Copy the config file and the first-step-decoder manually\n",
|
1166 |  | - "        shutil.copy(model_path / \"config.json\", quantized_distil_model_path / \"config.json\")\n",
|
1167 |  | - "        shutil.copy(model_path / \"openvino_decoder_model.xml\", quantized_distil_model_path / \"openvino_decoder_model.xml\")\n",
|
1168 |  | - "        shutil.copy(model_path / \"openvino_decoder_model.bin\", quantized_distil_model_path / \"openvino_decoder_model.bin\")\n",
|
 | 1130 | + "        shutil.copy(model_path / \"config.json\", quantized_model_path / \"config.json\")\n",
|
 | 1131 | + "        shutil.copy(model_path / \"openvino_decoder_model.xml\", quantized_model_path / \"openvino_decoder_model.xml\")\n",
|
 | 1132 | + "        shutil.copy(model_path / \"openvino_decoder_model.bin\", quantized_model_path / \"openvino_decoder_model.bin\")\n",
|
1169 | 1133 | "\n",
|
1170 |  | - "    quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_distil_model_path, ov_config=ov_config, compile=False)\n",
|
 | 1134 | + "    quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_model_path, ov_config=ov_config, compile=False)\n",
|
1171 | 1135 | "    quantized_ov_model.to(device.value)\n",
|
1172 | 1136 | "    quantized_ov_model.compile()\n",
|
1173 | 1137 | "    return quantized_ov_model\n",
|
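Once `quantize` is defined, invoking it is a one-liner. A usage sketch, assuming `ov_model` and `CALIBRATION_DATASET_SIZE` from the surrounding cells:

```python
# Quantize (or reuse the cached INT8 model if quantized_model_path already exists).
ov_quantized_model = quantize(ov_model, CALIBRATION_DATASET_SIZE)

# The result behaves like the original OVModelForSpeechSeq2Seq, so it can be
# dropped into the same transcription code path used for the original model.
```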
|