.. {#llm_inference}

LLM Inference with Hugging Face and Optimum Intel
=====================================================

The steps below show how to load LLMs from Hugging Face using Optimum Intel and run inference with them.
They also show how to convert models into OpenVINO IR format so they can be optimized
by NNCF and used with other OpenVINO tools.

Prerequisites
############################################################

* Create a Python environment by following the instructions on the :doc:`Install OpenVINO PIP <openvino_docs_install_guides_overview>` page.
* Install the necessary dependencies for Optimum Intel:

.. code-block:: console

   pip install optimum[openvino,nncf]

Loading a Hugging Face Model to Optimum Intel
############################################################

To start using OpenVINO as a backend for Hugging Face, change the original Hugging Face code in two places:

.. code-block:: diff

   -from transformers import AutoModelForCausalLM
   +from optimum.intel import OVModelForCausalLM

    model_id = "meta-llama/Llama-2-7b-chat-hf"
   -model = AutoModelForCausalLM.from_pretrained(model_id)
   +model = OVModelForCausalLM.from_pretrained(model_id, export=True)

Instead of using ``AutoModelForCausalLM`` from the Hugging Face transformers library,
switch to ``OVModelForCausalLM`` from the optimum.intel library. This change enables
you to use OpenVINO's optimization features. You may also use other AutoModel types,
such as ``OVModelForSeq2SeqLM``, though this guide focuses on CausalLM.

By setting the parameter ``export=True``, the model is converted to OpenVINO IR format on the fly.

After that, you can call the ``save_pretrained()`` method to save the model to a folder in the
OpenVINO Intermediate Representation format and use it later:

.. code-block:: python

   model.save_pretrained("ov_model")

This will create a new folder called `ov_model` with the LLM in OpenVINO IR format inside.
You can provide a different directory name instead of `ov_model`.

Once the model is saved, you can load it with the following command:

.. code-block:: python

   model = OVModelForCausalLM.from_pretrained("ov_model")

Converting a Hugging Face Model to OpenVINO IR
############################################################

The optimum-cli tool allows you to convert models from Hugging Face to
the OpenVINO IR format:

.. code-block:: console

   optimum-cli export openvino --model <MODEL_NAME> <NEW_MODEL_NAME>

If you want to convert the `Llama 2` model from Hugging Face to an OpenVINO IR
model and name it `ov_llama_2`, the command would look like this:

.. code-block:: console

   optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf ov_llama_2

In this case, you can load the converted model in OpenVINO representation directly from the disk:

.. code-block:: python

   model_id = "ov_llama_2"
   model = OVModelForCausalLM.from_pretrained(model_id)

The Optimum-Intel API also provides out-of-the-box model optimization through weight compression
using NNCF, which substantially reduces the model footprint and inference latency:

.. code-block:: python

   model = OVModelForCausalLM.from_pretrained(model_id, export=True, load_in_8bit=True)

   # or if the model has already been converted
   model = OVModelForCausalLM.from_pretrained(model_path, load_in_8bit=True)

   # save the model after optimization
   model.save_pretrained(optimized_model_path)


Weight compression is also available in the CLI interface via the ``--int8`` option.

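For example, the `Llama 2` export shown earlier could be repeated with 8-bit weight compression from
the command line (a sketch assuming the ``--int8`` flag described above; the output directory name is arbitrary):

.. code-block:: console

   optimum-cli export openvino --model meta-llama/Llama-2-7b-chat-hf --int8 ov_llama_2_int8
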
.. note::

   8-bit weight compression is enabled by default for models larger than 1 billion parameters.

`Optimum Intel <https://huggingface.co/docs/optimum/intel/inference>`__ also provides 4-bit weight
compression with the ``OVWeightQuantizationConfig`` class to control weight quantization parameters.

.. code-block:: python

   from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig
   import nncf

   model = OVModelForCausalLM.from_pretrained(
       model_id,
       export=True,
       quantization_config=OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, dataset="ptb"),
   )

   # or if the model has already been converted
   model = OVModelForCausalLM.from_pretrained(
       model_path,
       quantization_config=OVWeightQuantizationConfig(bits=4, sym=False, ratio=0.8, dataset="ptb"),
   )

   # save the model after optimization
   model.save_pretrained(optimized_model_path)


The optimized model can be saved as usual with a call to ``save_pretrained()``.
For more details on compression options, refer to the :doc:`weight compression guide <weight_compression>`.

.. note::

   OpenVINO also supports 4-bit models from the Hugging Face `Transformers <https://github.com/huggingface/transformers>`__
   library optimized with `GPTQ <https://github.com/PanQiWei/AutoGPTQ>`__. In this case, there is no need for an
   additional model optimization step because model conversion will automatically preserve the INT4 optimization
   results, allowing model inference to benefit from it.

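For example, a GPTQ-quantized model from the Hugging Face Hub can be loaded and exported the same way as any
other model (a sketch; the model name below is only an illustration):

.. code-block:: python

   # the INT4 GPTQ weights are preserved when the model is converted to OpenVINO IR
   model = OVModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GPTQ", export=True)
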
Below are some examples of using Optimum-Intel for model conversion and inference:

* `Instruction following using Databricks Dolly 2.0 and OpenVINO <https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/240-dolly-2-instruction-following/240-dolly-2-instruction-following.ipynb>`__
* `Create an LLM-powered Chatbot using OpenVINO <https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/254-llm-chatbot/254-llm-chatbot.ipynb>`__

.. note::

   Optimum-Intel can be used for other generative AI models. See `Stable Diffusion v2.1 using Optimum-Intel OpenVINO <https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/236-stable-diffusion-v2/236-stable-diffusion-v2-optimum-demo.ipynb>`__ and `Image generation with Stable Diffusion XL and OpenVINO <https://github.com/openvinotoolkit/openvino_notebooks/blob/main/notebooks/248-stable-diffusion-xl/248-stable-diffusion-xl.ipynb>`__ for more examples.

Inference Example
############################################################

For Hugging Face models, the ``AutoTokenizer`` class is used to prepare model inputs, and text is generated
either with the model's ``generate()`` method or with the Transformers ``pipeline`` function. The example
below uses ``generate()`` directly:

.. code-block:: python

   from optimum.intel import OVModelForCausalLM
   # new imports for inference
   from transformers import AutoTokenizer

   # load the model
   model_id = "meta-llama/Llama-2-7b-chat-hf"
   model = OVModelForCausalLM.from_pretrained(model_id, export=True)

   # inference
   prompt = "The weather is:"
   tokenizer = AutoTokenizer.from_pretrained(model_id)
   inputs = tokenizer(prompt, return_tensors="pt")

   outputs = model.generate(**inputs, max_new_tokens=50)
   print(tokenizer.decode(outputs[0], skip_special_tokens=True))

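The same model and tokenizer can also be wrapped in a Transformers ``pipeline``. A minimal sketch, assuming
the ``text-generation`` task and reusing the objects created above:

.. code-block:: python

   from transformers import pipeline

   # the OpenVINO model plugs into the standard Transformers pipeline API
   pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
   print(pipe("The weather is:", max_new_tokens=50)[0]["generated_text"])
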
.. note::

   Converting LLMs to OpenVINO IR on the fly every time is a resource-intensive task.
   It is good practice to convert the model once, save it in a folder, and load it from there for inference.

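A minimal sketch of that pattern, reusing the `ov_model` directory name from the earlier example (any
directory name works):

.. code-block:: python

   # convert and save once
   model = OVModelForCausalLM.from_pretrained(model_id, export=True)
   model.save_pretrained("ov_model")
   tokenizer.save_pretrained("ov_model")

   # later, load the already converted model for inference
   model = OVModelForCausalLM.from_pretrained("ov_model")
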
By default, inference will run on CPU. To select a different inference device, for example, GPU,
add ``device="GPU"`` to the ``from_pretrained()`` call. To switch to a different device after
the model has been loaded, use the ``.to()`` method. The device naming convention is the same
as in the OpenVINO native API:

.. code-block:: python

   model.to("GPU")

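For example, a sketch of loading the previously saved model directly on GPU through the ``device`` argument
(the `ov_model` directory name is assumed from the earlier example):

.. code-block:: python

   # compile the model for GPU at load time instead of calling .to("GPU") afterwards
   model = OVModelForCausalLM.from_pretrained("ov_model", device="GPU")
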
Enabling OpenVINO Runtime Optimizations
############################################################

OpenVINO runtime provides a set of optimizations for more efficient LLM inference. This includes **Dynamic quantization**
of activations of 4/8-bit quantized MatMuls and **KV-cache quantization**.

* **Dynamic quantization** enables quantization of activations of MatMul operations that have 4- or 8-bit quantized
  weights (see :doc:`LLM Weight Compression <weight_compression>`). It improves inference latency and throughput of
  LLMs, though it may cause an insignificant deviation in generation accuracy. Quantization is performed in a
  group-wise manner, with a configurable group size, which means that values in a group share quantization
  parameters. Larger group sizes lead to faster inference but lower accuracy. Recommended group size values are
  ``32``, ``64``, or ``128``. To enable Dynamic quantization, use the corresponding inference property as follows:

  .. code-block:: python

     model = OVModelForCausalLM.from_pretrained(
         model_path,
         ov_config={"DYNAMIC_QUANTIZATION_GROUP_SIZE": "32", "PERFORMANCE_HINT": "LATENCY"}
     )

* **KV-cache quantization** allows lowering the precision of the Key and Value cache in LLMs. This helps reduce
  memory consumption during inference, improving latency and throughput. The KV-cache can be quantized into the
  following precisions: ``u8``, ``bf16``, ``f16``. If ``u8`` is used, KV-cache quantization is also applied in a
  group-wise manner and therefore uses the ``DYNAMIC_QUANTIZATION_GROUP_SIZE`` value if it is defined. Otherwise,
  a group size of ``32`` is used by default. KV-cache quantization can be enabled as follows:

  .. code-block:: python

     model = OVModelForCausalLM.from_pretrained(
         model_path,
         ov_config={"KV_CACHE_PRECISION": "u8", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32", "PERFORMANCE_HINT": "LATENCY"}
     )

.. note::

   Currently, both Dynamic quantization and KV-cache quantization are available for the CPU device.

Working with Models Tuned with LoRA
#########################################

Low-rank Adaptation (LoRA) is a popular method to tune Generative AI models to a downstream task
or custom data. However, it requires some extra steps for efficient deployment using
the Hugging Face API. Namely, the trained adapters should be fused into the baseline model to
avoid extra computation. This is how it can be done for LLMs:

.. code-block:: python

   from transformers import AutoModelForCausalLM
   from peft import PeftModelForCausalLM

   model_id = "meta-llama/Llama-2-7b-chat-hf"
   lora_adaptor = "./lora_adaptor"

   # load the base model and attach the trained LoRA adapter
   model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=True)
   model = PeftModelForCausalLM.from_pretrained(model, lora_adaptor)

   # fuse the adapter weights into the base model and save the result
   model.merge_and_unload()
   model.get_base_model().save_pretrained("fused_lora_model")


Now the model can be converted to OpenVINO using the Optimum Intel Python API or the CLI interface mentioned above.

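For example, a sketch of loading and exporting the fused model saved above with the Python API:

.. code-block:: python

   # the fused model is a regular Transformers model, so it can be exported like any other
   model = OVModelForCausalLM.from_pretrained("fused_lora_model", export=True)
   model.save_pretrained("ov_fused_lora_model")
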

Additional Resources
#####################

* `Optimum Intel documentation <https://huggingface.co/docs/optimum/intel/inference>`__
* :doc:`LLM Weight Compression <weight_compression>`
* `Neural Network Compression Framework <https://github.com/openvinotoolkit/nncf>`__
* `Hugging Face Transformers <https://huggingface.co/docs/transformers/index>`__
* `Generation with LLMs <https://huggingface.co/docs/transformers/llm_tutorial>`__
* `Pipeline class <https://huggingface.co/docs/transformers/main_classes/pipelines>`__
* `GenAI Pipeline Repository <https://github.com/openvinotoolkit/openvino.genai>`__
* `OpenVINO Tokenizers <https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/custom_operations/user_ie_extensions/tokenizer/python>`__