Commit e3eef01

move sd notebook to genai (#2730)
CVS-161646
1 parent a852a4b commit e3eef01

5 files changed: +1199 -203 lines


.ci/ignore_treon_docker.txt (-1)

@@ -4,7 +4,6 @@ notebooks/model-server/model-server.ipynb
 notebooks/quantizing-model-with-accuracy-control/speech-recognition-quantization-wav2vec2.ipynb
 notebooks/quantizing-model-with-accuracy-control/yolov11-quantization-with-accuracy-control.ipynb
 notebooks/big-transfer-quantization/tensorflow-bit-image-classification-nncf-quantization.ipynb
-notebooks/stable-diffusion-text-to-image/stable-diffusion-text-to-image.ipynb
 notebooks/clip-zero-shot-image-classification/clip-zero-shot-classification.ipynb
 notebooks/instruct-pix2pix-image-editing/instruct-pix2pix-image-editing.ipynb
 notebooks/blip-visual-language-processing/blip-visual-language-processing.ipynb

.ci/skipped_notebooks.yml (-7)

@@ -50,13 +50,6 @@
     - os:
         - ubuntu-20.04
         - ubuntu-22.04
-- notebook: notebooks/stable-diffusion-text-to-image/stable-diffusion-text-to-image.ipynb
-  skips:
-    - os:
-        - macos-13
-        - ubuntu-20.04
-        - ubuntu-22.04
-        - windows-2019
 - notebook: notebooks/clip-zero-shot-image-classification/clip-zero-shot-classification.ipynb
   skips:
     - os:

notebooks/stable-diffusion-text-to-image/README.md (+1 -1)

@@ -26,7 +26,7 @@ The following image shows an example of the input sequence and corresponding pre
 
 ## Notebook Contents
 
-This notebook demonstrates how to convert and run Stable Diffusion using OpenVINO.
+This notebook demonstrates how to convert and run Stable Diffusion using OpenVINO. To simplify the user experience, we use the [Hugging Face Optimum](https://huggingface.co/docs/optimum/installation) library, accelerated by its OpenVINO integration, for model conversion, and the [OpenVINO GenAI API](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide/genai-guide.html) for model inference.
 
 The notebook contains the following parts:
 1. Download the model from the Hugging Face Hub and convert it to OpenVINO IR format with [Optimum Intel](https://huggingface.co/docs/optimum/intel/inference#stable-diffusion).
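For orientation, the two-step flow this README change describes (convert once with Optimum, then infer with OpenVINO GenAI) can be sketched as follows. This is a minimal sketch rather than the notebook's exact code: the model directory, device string, prompt, and generation parameters are illustrative assumptions.

# Minimal sketch of the conversion + inference flow (names are placeholders).
# Step 1, conversion (run once in a shell):
#   optimum-cli export openvino --model <hf-model-id> stable-diffusion-ov
import openvino_genai as ov_genai
from PIL import Image

model_dir = "stable-diffusion-ov"  # assumed output directory of the export step

# Step 2, inference: build a text-to-image pipeline on any supported device.
pipe = ov_genai.Text2ImagePipeline(model_dir, "CPU")

# generate() returns an ov.Tensor of uint8 images in NHWC layout; index 0 is the image.
image_tensor = pipe.generate(
    "valley in the Alps at sunset, epic vista",
    width=512,
    height=512,
    num_inference_steps=20,
)
Image.fromarray(image_tensor.data[0]).save("result.png")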

notebooks/stable-diffusion-text-to-image/gradio_helper.py (+70 -8)

@@ -1,14 +1,76 @@
 import gradio as gr
 import numpy as np
+from tqdm.auto import tqdm
+import openvino as ov
+import openvino_genai as ov_genai
+import sys
+from PIL import Image
 
+sample_text = (
+    "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting, epic composition. "
+    "A golden daylight, hyper-realistic environment. "
+    "Hyper and intricate detail, photo-realistic. "
+    "Cinematic and volumetric light. "
+    "Epic concept art. "
+    "Octane render and Unreal Engine, trending on artstation"
+)
 
-def make_demo(pipeline, preprocess, postprocess, default_image_path):
+
+def image_to_tensor(image) -> ov.Tensor:
+    pic = image.convert("RGB")
+    image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8)
+    return ov.Tensor(image_data)
+
+
+def make_demo(pipeline):
+    def generate_from_text(text, seed, num_steps, _=gr.Progress(track_tqdm=True)):
+        random_generator = ov_genai.TorchGenerator(seed)
+
+        pbar = tqdm(total=num_steps)
+
+        def callback(step, num_steps, latent):
+            if num_steps != pbar.total:
+                pbar.reset(num_steps)
+            pbar.update(1)
+            sys.stdout.flush()
+            return False
+
+        result = pipeline.generate(text, num_inference_steps=num_steps, generator=random_generator, callback=callback)
+
+        pbar.close()
+        return Image.fromarray(result.data[0])
+
+    with gr.Blocks() as demo:
+        with gr.Tab("Text-to-Image generation"):
+            with gr.Row():
+                with gr.Column():
+                    text_input = gr.Textbox(lines=3, label="Text")
+                    seed_input = gr.Slider(0, 10000000, value=42, step=1, label="Seed")
+                    steps_input = gr.Slider(1, 50, value=20, step=1, label="Steps")
+                out = gr.Image(label="Result", type="pil")
+            btn = gr.Button()
+            btn.click(generate_from_text, [text_input, seed_input, steps_input], out)
+            gr.Examples([[sample_text, 42, 20]], [text_input, seed_input, steps_input])
+    return demo
+
+
+def make_demo_i2i(pipeline, default_image_path):
     def generate_from_image(img, text, seed, num_steps, strength, _=gr.Progress(track_tqdm=True)):
-        preprocessed_img, meta_data = preprocess(img)
-        np.random.seed(seed)
-        result = pipeline(text, preprocessed_img, num_inference_steps=num_steps, strength=strength)
-        result_img = postprocess(result["images"][0], meta_data["src_width"], meta_data["src_height"])
-        return result_img
+        img_tensor = image_to_tensor(img)
+
+        pbar = tqdm(total=int(num_steps * strength) + 1)
+
+        def callback(step, num_steps, latent):
+            if num_steps != pbar.total:
+                pbar.reset(num_steps)
+            pbar.update(1)
+            sys.stdout.flush()
+            return False
+
+        random_generator = ov_genai.TorchGenerator(seed)
+        result = pipeline.generate(text, img_tensor, num_inference_steps=num_steps, strength=strength, generator=random_generator, callback=callback)
+        pbar.close()
+        return Image.fromarray(result.data[0])
 
     with gr.Blocks() as demo:
         with gr.Tab("Image-to-Image generation"):
@@ -18,7 +80,7 @@ def generate_from_image(img, text, seed, num_steps, strength, _=gr.Progress(track_tqdm=True)):
                     i2i_text_input = gr.Textbox(lines=3, label="Text")
                     i2i_seed_input = gr.Slider(0, 1024, value=42, step=1, label="Seed")
                     i2i_steps_input = gr.Slider(1, 50, value=10, step=1, label="Steps")
-                    strength_input = gr.Slider(0, 1, value=0.5, label="Strength")
+                    strength_input = gr.Slider(0, 1, value=0.25, label="Strength")
                 i2i_out = gr.Image(label="Result")
             i2i_btn = gr.Button()
             sample_i2i_text = "amazing watercolor painting"
@@ -34,7 +96,7 @@ def generate_from_image(img, text, seed, num_steps, strength, _=gr.Progress(track_tqdm=True)):
                 i2i_out,
             )
             gr.Examples(
-                [[str(default_image_path), sample_i2i_text, 42, 10, 0.5]],
+                [[str(default_image_path), sample_i2i_text, 42, 10, 0.25]],
                 [
                     i2i_input,
                     i2i_text_input,
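For reference, a hypothetical notebook cell wiring up the rewritten helpers might look like the sketch below. The model directory, device, and example image path are assumptions; Text2ImagePipeline and Image2ImagePipeline are the OpenVINO GenAI pipeline classes implied by the helpers' pipeline.generate(...) calls.

# Hypothetical usage of the updated helpers (paths and device are placeholders).
from pathlib import Path

import openvino_genai as ov_genai
from gradio_helper import make_demo, make_demo_i2i

model_dir = "stable-diffusion-ov"

# Text-to-image tab: make_demo now takes only the pipeline, since seeding and
# progress reporting happen inside via TorchGenerator and the step callback.
t2i_pipe = ov_genai.Text2ImagePipeline(model_dir, "CPU")
demo = make_demo(t2i_pipe)

# Image-to-image tab: the default image populates the gr.Examples row.
i2i_pipe = ov_genai.Image2ImagePipeline(model_dir, "CPU")
demo_i2i = make_demo_i2i(i2i_pipe, Path("default.jpg"))

# Launch one demo at a time; launch() blocks until the server is stopped.
demo.launch()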
