fix issues found by nightly (#2739)

eaidova · web-flow · commit c24b2b03b6ec · 2025-02-10T22:03:53.000+04:00
diff --git a/notebooks/bark-text-to-audio/bark-text-to-audio.ipynb b/notebooks/bark-text-to-audio/bark-text-to-audio.ipynb
@@ -72,7 +72,7 @@
    "source": [
     "%pip install -q \"torch\" \"torchvision\" \"torchaudio\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
     "%pip install -q \"openvino>=2023.1.0\" \"gradio>=4.19\"\n",
-    "%pip install -q \"git+https://github.com/suno-ai/bark.git\" --extra-index-url https://download.pytorch.org/whl/cpu"
+    "%pip install -q \"git+https://github.com/const-volatile/bark.git@pytorch-2.4+\" --extra-index-url https://download.pytorch.org/whl/cpu"
    ]
   },
   {
@@ -142,7 +142,7 @@
    "source": [
     "text_use_small = True\n",
     "\n",
-    "text_encoder = load_model(model_type=\"text\", use_gpu=False, use_small=text_use_small, force_reload=False, weights_only=False)\n",
+    "text_encoder = load_model(model_type=\"text\", use_gpu=False, use_small=text_use_small, force_reload=False)\n",
     "\n",
     "text_encoder_model = text_encoder[\"model\"]\n",
     "tokenizer = text_encoder[\"tokenizer\"]"
diff --git a/notebooks/multimodal-rag/multimodal-rag-llamaindex.ipynb b/notebooks/multimodal-rag/multimodal-rag-llamaindex.ipynb
@@ -385,11 +385,12 @@
    "source": [
     "from optimum.intel import OVModelForSpeechSeq2Seq\n",
     "from transformers import AutoProcessor, pipeline\n",
+    "import torch\n",
     "\n",
     "asr_model = OVModelForSpeechSeq2Seq.from_pretrained(asr_model_path, device=asr_device.value)\n",
     "asr_processor = AutoProcessor.from_pretrained(asr_model_path)\n",
     "\n",
-    "pipe = pipeline(\"automatic-speech-recognition\", model=asr_model, tokenizer=asr_processor.tokenizer, feature_extractor=asr_processor.feature_extractor)"
+    "pipe = pipeline(\"automatic-speech-recognition\", model=asr_model, tokenizer=asr_processor.tokenizer, feature_extractor=asr_processor.feature_extractor, device=torch.device(\"cpu\"))"
    ]
   },
   {
diff --git a/notebooks/named-entity-recognition/named-entity-recognition.ipynb b/notebooks/named-entity-recognition/named-entity-recognition.ipynb
@@ -54,8 +54,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import platform\n",
+    "\n",
     "%pip install -q \"diffusers>=0.17.1\" \"openvino>=2023.1.0\" \"nncf>=2.5.0\" \"gradio>=4.19\" \"onnx>=1.11.0,<1.16.2\" \"transformers>=4.33.0\" \"torch>=2.1\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
-    "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\""
+    "%pip install -q \"git+https://github.com/huggingface/optimum-intel.git\"\n",
+    "\n",
+    "if platform.system() == \"Darwin\":\n",
+    "    %pip install -q \"numpy<2.0\""
    ]
   },
   {
@@ -412,10 +417,11 @@
    "outputs": [],
    "source": [
     "from transformers import pipeline\n",
+    "import torch\n",
     "\n",
-    "ner_pipeline_optimized = pipeline(\"token-classification\", model=optimized_model, tokenizer=tokenizer)\n",
+    "ner_pipeline_optimized = pipeline(\"token-classification\", model=optimized_model, tokenizer=tokenizer, device=torch.device(\"cpu\"))\n",
     "\n",
-    "ner_pipeline_original = pipeline(\"token-classification\", model=model, tokenizer=tokenizer)"
+    "ner_pipeline_original = pipeline(\"token-classification\", model=model, tokenizer=tokenizer, device=torch.device(\"cpu\"))"
    ]
   },
   {
diff --git a/notebooks/omnigen/ov_omnigen_helper.py b/notebooks/omnigen/ov_omnigen_helper.py
@@ -272,8 +272,11 @@ def rope_fwd(self, x, position_ids, seq_len=None):
 
             pipe.model.llm._orig_forward = pipe.model.llm.forward
             pipe.model.llm.forward = MethodType(forward_wrap, pipe.model.llm)
-            for layer in pipe.model.llm.layers:
-                layer.self_attn.rotary_emb.forward = MethodType(rope_fwd, layer.self_attn.rotary_emb)
+            if hasattr(pipe.model.llm, "rotary_emb"):
+                pipe.model.llm.rotary_emb.forward = MethodType(rope_fwd, pipe.model.llm.rotary_emb)
+            else:
+                for layer in pipe.model.llm.layers:
+                    layer.self_attn.rotary_emb.forward = MethodType(rope_fwd, layer.self_attn.rotary_emb)
             for i in range(num_hidden_layers):
                 past_key_values.append((torch.randn(pkv_shape), torch.randn(pkv_shape)))
                 input_names.extend([f"past_key_values.{i}.key", f"past_key_values.{i}.value"])
diff --git a/notebooks/optical-character-recognition/optical-character-recognition.ipynb b/notebooks/optical-character-recognition/optical-character-recognition.ipynb
@@ -298,8 +298,7 @@
     "        filename=image_path.name,\n",
     "        directory=image_path.parent,\n",
     "    )\n",
-    "else:\n",
-    "    image = cv2.imread(str(image_path))\n",
+    "image = cv2.imread(str(image_path))\n",
     "\n",
     "# N,C,H,W = batch size, number of channels, height, width.\n",
     "N, C, H, W = detection_input_layer.shape\n",
diff --git a/notebooks/qwen2-audio/ov_qwen2_audio_helper.py b/notebooks/qwen2-audio/ov_qwen2_audio_helper.py
@@ -273,14 +273,18 @@ def forward_wrap(
             past_key_values=None,
             inputs_embeds=None,
         ):
+            from transformers.cache_utils import DynamicCache
+
+            if past_key_values is not None:
+                pkv = DynamicCache.from_legacy_cache(past_key_values)
             result = self._orig_forward(
                 input_ids=None,
                 attention_mask=attention_mask,
                 position_ids=position_ids,
-                past_key_values=past_key_values,
+                past_key_values=pkv,
                 inputs_embeds=inputs_embeds,
             )
-            return tuple(result.values())
+            return (result.logits, result.past_key_values.to_legacy_cache())
 
         lang_model = model.language_model
         print(lang_model.config)
diff --git a/notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb b/notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb
@@ -747,6 +747,7 @@
     "from datasets import load_dataset\n",
     "from transformers import pipeline\n",
     "from optimum.intel.openvino.quantization import InferRequestWrapper\n",
+    "import torch\n",
     "\n",
     "\n",
     "def collect_calibration_dataset(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):\n",
@@ -763,7 +764,7 @@
     "      model=ov_model,\n",
     "      chunk_length_s=30,\n",
     "      tokenizer=processor.tokenizer,\n",
-    "      feature_extractor=processor.feature_extractor, devide=torch.device(\"cpu\"))\n",
+    "      feature_extractor=processor.feature_extractor, device=torch.device(\"cpu\"))\n",
     "    try:\n",
     "        calibration_dataset = dataset = load_dataset(\"openslr/librispeech_asr\", \"clean\", split=\"validation\", streaming=True, trust_remote_code=True)\n",
     "        for sample in tqdm(islice(calibration_dataset, calibration_dataset_size), desc=\"Collecting calibration data\",\n",

Original file line number	Diff line number	Diff line change
`@@ -385,11 +385,12 @@`
`385`	`385`	`"source": [`
`386`	`386`	`"from optimum.intel import OVModelForSpeechSeq2Seq\n",`
`387`	`387`	`"from transformers import AutoProcessor, pipeline\n",`
	`388`	`+ "import torch\n",`
`388`	`389`	`"\n",`
`389`	`390`	`"asr_model = OVModelForSpeechSeq2Seq.from_pretrained(asr_model_path, device=asr_device.value)\n",`
`390`	`391`	`"asr_processor = AutoProcessor.from_pretrained(asr_model_path)\n",`
`391`	`392`	`"\n",`
`392`		`- "pipe = pipeline(\"automatic-speech-recognition\", model=asr_model, tokenizer=asr_processor.tokenizer, feature_extractor=asr_processor.feature_extractor)"`
	`393`	`+ "pipe = pipeline(\"automatic-speech-recognition\", model=asr_model, tokenizer=asr_processor.tokenizer, feature_extractor=asr_processor.feature_extractor, device=torch.device(\"cpu\"))"`
`393`	`394`	`]`
`394`	`395`	`},`
`395`	`396`	`{`