@@ -2,8 +2,7 @@
 import openvino_genai as ov_genai
 from uuid import uuid4
 from threading import Event, Thread
-import queue
-import sys
+from gena_helper import ChunkStreamer
 
 max_new_tokens = 256
 
@@ -65,137 +64,6 @@ def get_system_prompt(model_language, system_prompt=None):
 )
 
 
-class IterableStreamer(ov_genai.StreamerBase):
-    """
-    A custom streamer class for handling token streaming and detokenization with buffering.
-
-    Attributes:
-        tokenizer (Tokenizer): The tokenizer used for encoding and decoding tokens.
-        tokens_cache (list): A buffer to accumulate tokens for detokenization.
-        text_queue (Queue): A synchronized queue for storing decoded text chunks.
-        print_len (int): The length of the printed text to manage incremental decoding.
-    """
-
-    def __init__(self, tokenizer):
-        """
-        Initializes the IterableStreamer with the given tokenizer.
-
-        Args:
-            tokenizer (Tokenizer): The tokenizer to use for encoding and decoding tokens.
-        """
-        super().__init__()
-        self.tokenizer = tokenizer
-        self.tokens_cache = []
-        self.text_queue = queue.Queue()
-        self.print_len = 0
-        self.decoded_lengths = []
-
-    def __iter__(self):
-        """
-        Returns the iterator object itself.
-        """
-        return self
-
-    def __next__(self):
-        """
-        Returns the next value from the text queue.
-
-        Returns:
-            str: The next decoded text chunk.
-
-        Raises:
-            StopIteration: If there are no more elements in the queue.
-        """
-        value = self.text_queue.get()  # get() will be blocked until a token is available.
-        if value is None:
-            raise StopIteration
-        return value
-
-    def get_stop_flag(self):
-        """
-        Checks whether the generation process should be stopped.
-
-        Returns:
-            bool: Always returns False in this implementation.
-        """
-        return False
-
-    def put_word(self, word: str):
-        """
-        Puts a word into the text queue.
-
-        Args:
-            word (str): The word to put into the queue.
-        """
-        self.text_queue.put(word)
-
-    def put(self, token_id: int) -> bool:
-        """
-        Processes a token and manages the decoding buffer. Adds decoded text to the queue.
-
-        Args:
-            token_id (int): The token_id to process.
-
-        Returns:
-            bool: True if generation should be stopped, False otherwise.
-        """
-        self.tokens_cache.append(token_id)
-        text = self.tokenizer.decode(self.tokens_cache)
-        self.decoded_lengths.append(len(text))
-
-        word = ""
-        delay_n_tokens = 3
-        if len(text) > self.print_len and "\n" == text[-1]:
-            # Flush the cache after the new line symbol.
-            word = text[self.print_len :]
-            self.tokens_cache = []
-            self.decoded_lengths = []
-            self.print_len = 0
-        elif len(text) > 0 and text[-1] == chr(65533):
-            # Don't print incomplete text.
-            self.decoded_lengths[-1] = -1
-        elif len(self.tokens_cache) >= delay_n_tokens:
-            print_until = self.decoded_lengths[-delay_n_tokens]
-            if print_until != -1 and print_until > self.print_len:
-                # It is possible to have a shorter text after adding new token.
-                # Print to output only if text length is increased and text is complete (print_until != -1).
-                word = text[self.print_len : print_until]
-                self.print_len = print_until
-        self.put_word(word)
-
-        if self.get_stop_flag():
-            # When generation is stopped from streamer then end is not called, need to call it here manually.
-            self.end()
-            return True  # True means stop generation
-        else:
-            return False  # False means continue generation
-
-    def end(self):
-        """
-        Flushes residual tokens from the buffer and puts a None value in the queue to signal the end.
-        """
-        text = self.tokenizer.decode(self.tokens_cache)
-        if len(text) > self.print_len:
-            word = text[self.print_len :]
-            self.put_word(word)
-        self.tokens_cache = []
-        self.print_len = 0
-        self.put_word(None)
-
-
-class ChunkStreamer(IterableStreamer):
-
-    def __init__(self, tokenizer, tokens_len):
-        super().__init__(tokenizer)
-        self.tokens_len = tokens_len
-
-    def put(self, token_id: int) -> bool:
-        if (len(self.tokens_cache) + 1) % self.tokens_len != 0:
-            self.tokens_cache.append(token_id)
-            self.decoded_lengths.append(-1)
-            return False
-        return super().put(token_id)
-
 
 def make_demo(pipe, model_configuration, model_id, model_language, disable_advanced=False):
     import gradio as gr
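
The streamer classes removed above now live in gena_helper and are pulled in via the new import. For reference, here is a minimal sketch of how the imported ChunkStreamer is typically driven, assuming it keeps the removed class's (tokenizer, tokens_len) constructor and blocking-iterator behavior; the stream_answer helper and its parameters are hypothetical, and pipe is assumed to be an ov_genai.LLMPipeline:

from threading import Thread

import openvino_genai as ov_genai

from gena_helper import ChunkStreamer


def stream_answer(pipe: ov_genai.LLMPipeline, prompt: str, tokens_len: int = 3):
    # Hypothetical helper, not part of this commit.
    streamer = ChunkStreamer(pipe.get_tokenizer(), tokens_len)

    config = ov_genai.GenerationConfig()
    config.max_new_tokens = 256  # matches max_new_tokens at the top of this file

    # generate() blocks until completion, so it runs on a worker thread;
    # the streamer's internal queue hands decoded chunks back to this thread.
    worker = Thread(target=pipe.generate, args=(prompt, config, streamer))
    worker.start()

    partial = ""
    for chunk in streamer:  # iteration stops once end() enqueues None
        partial += chunk
        yield partial

    worker.join()

Because __next__ blocks on the queue and end() enqueues a None sentinel, the consumer loop terminates cleanly when generation finishes; keeping generate() on a worker thread is what lets a UI callback yield partial text as it arrives.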