TensorRT-LLM v0.13 Update #2269

Merged 2 commits on Sep 30, 2024
3 changes: 3 additions & 0 deletions .gitmodules
@@ -11,3 +11,6 @@
[submodule "3rdparty/NVTX"]
path = 3rdparty/NVTX
url = https://github.com/NVIDIA/NVTX.git
[submodule "3rdparty/ucxx"]
path = 3rdparty/ucxx
url = https://github.com/GuanLuo/ucxx.git
1 change: 1 addition & 0 deletions 3rdparty/ucxx
Submodule ucxx added at b99181
10 changes: 5 additions & 5 deletions README.md
@@ -7,8 +7,8 @@ TensorRT-LLM
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.3.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.12.0-green)](./tensorrt_llm/version.py)
[![trt](https://img.shields.io/badge/TRT-10.4.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.13.0-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)
@@ -17,11 +17,11 @@ TensorRT-LLM
<div align="left">

## Latest News
* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)

* [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere
[➡️ link](https://developer.nvidia.com/blog/revolutionizing-code-completion-with-codestral-mamba-the-next-gen-coding-llm/)
<div align="center">
<img src="docs/source/media/picture-08-13-2024.png" width="50%">
<div align="left">

* [2024/08/06] 🗫 Multilingual Challenge Accepted 🗫
🤖 #TensorRT #LLM boosts low-resource languages like Hebrew, Indonesian and Vietnamese ⚡[➡️ link](https://developer.nvidia.com/blog/accelerating-hebrew-llm-performance-with-nvidia-tensorrt-llm/?linkId=100000278659647)
4 changes: 2 additions & 2 deletions benchmarks/cpp/README.md
@@ -267,10 +267,10 @@ for nloras in ${NUM_LORAS[@]}; do
--input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
done

# Generate random lora weights for 256 adapters
# Generate random lora weights for 16 adapters
python benchmarks/cpp/utils/generate_rand_loras.py ${CPP_LORA} ${EG_DIR}/loras 16

# perform benchmarking
# Perform benchmarking

# First run inference without LoRAs
mkdir -p ${EG_DIR}/log-base-lora
142 changes: 106 additions & 36 deletions benchmarks/cpp/gptManagerBenchmark.cpp

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -427,7 +427,8 @@ int main(int argc, char* argv[])

options.add_options()("ctx_micro_batch_size", "Batch size for context phase.", cxxopts::value<int>());
options.add_options()("gen_micro_batch_size", "Batch size for generation phase.", cxxopts::value<int>());
options.add_options()("max_attention_window", "Max kv cache length per sequence.", cxxopts::value<int>());
options.add_options()(
"max_attention_window", "Max kv cache length per sequence.", cxxopts::value<std::vector<int>>());
options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>());
options.add_options()("sink_token_len", "Sink token length in kv cache per sequence.", cxxopts::value<int>());
options.add_options()(
@@ -535,7 +536,7 @@ int main(int argc, char* argv[])
// Argument: Max KV Cache Length
if (result.count("max_attention_window"))
{
sessionConfig.kvCacheConfig.maxAttentionWindow = result["max_attention_window"].as<int>();
sessionConfig.kvCacheConfig.maxAttentionWindowVec = result["max_attention_window"].as<std::vector<int>>();
}
// Argument: Sink token length
if (result.count("sink_token_len"))
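Note: `--max_attention_window` now takes a comma-separated list (one value per layer) rather than a single integer. The C++ handling is not visible in this truncated hunk; the sketch below only illustrates the common convention, assumed here, that a vector shorter than the layer count is repeated cyclically across layers.

```python
# Conceptual sketch only (not the benchmark's C++ code): expanding a
# per-layer attention-window vector onto all layers. The cyclic-repetition
# rule is an assumption made for illustration.
from typing import List

def expand_attention_windows(windows: List[int], num_layers: int) -> List[int]:
    """Assign one max attention window to every layer."""
    if not windows:
        raise ValueError("need at least one window size")
    return [windows[i % len(windows)] for i in range(num_layers)]

# e.g. --max_attention_window 4096,4096,1024 on an 8-layer model
print(expand_attention_windows([4096, 4096, 1024], 8))
# [4096, 4096, 1024, 4096, 4096, 1024, 4096, 4096]
```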
25 changes: 12 additions & 13 deletions benchmarks/python/all_reduce.py
@@ -23,7 +23,6 @@

import tensorrt_llm as tllm
from tensorrt_llm import Mapping, Tensor
from tensorrt_llm._ipc_utils import peer_access
from tensorrt_llm._utils import OMPI_COMM_TYPE_HOST, mpi_comm
from tensorrt_llm.functional import AllReduceStrategy, allreduce
from tensorrt_llm.plugin.plugin import current_all_reduce_helper
@@ -106,18 +105,18 @@ def allreduce_benchmark(dtype: str,
_, start = cuda.cuEventCreate(0)
_, stop = cuda.cuEventCreate(0)
runtimes = []
with peer_access(mapping):
tllm.mpi_barrier()

for _ in range(10):
cuda.cuEventRecord(start, stream.cuda_stream)
session.run(inputs=feed_dict,
outputs={"output": output},
stream=stream.cuda_stream)
cuda.cuEventRecord(stop, stream.cuda_stream)
torch.cuda.synchronize()
_, ms = cuda.cuEventElapsedTime(start, stop)
runtimes.append(ms)

tllm.mpi_barrier()

for _ in range(10):
cuda.cuEventRecord(start, stream.cuda_stream)
session.run(inputs=feed_dict,
outputs={"output": output},
stream=stream.cuda_stream)
cuda.cuEventRecord(stop, stream.cuda_stream)
torch.cuda.synchronize()
_, ms = cuda.cuEventElapsedTime(start, stop)
runtimes.append(ms)

median_ms = sorted(runtimes)[len(runtimes) // 2]
assert torch.allclose(output, (input * world_size)**inner_loop)
12 changes: 5 additions & 7 deletions benchmarks/python/check_accuracy_mlperf.py
@@ -9,6 +9,7 @@
from transformers import AutoTokenizer, LlamaTokenizerFast

nltk.download("punkt", quiet=False)
nltk.download('punkt_tab')
import argparse
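Note: recent NLTK releases ship the Punkt sentence tokenizer as the pickle-free `punkt_tab` package, so the extra download keeps the ROUGE post-processing working across NLTK versions. A minimal sketch of the same guard:

```python
# Minimal sketch: fetch both tokenizer packages so the script works across
# NLTK versions (older ones only know "punkt", newer ones use "punkt_tab").
import nltk

nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
```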


@@ -25,10 +26,9 @@ class Model(Enum):
"tokens_per_sample": 294.45 * 0.9
},
Model.GPT_J: {
"rouge1": 42.9435135,
"rouge2": 20.1033765,
"rougeL": 29.9581119,
# "tokens_per_sample": ??
"rouge1": 42.9865 * 0.99,
"rouge2": 20.1235 * 0.99,
"rougeL": 29.9881 * 0.99,
}
}

@@ -138,7 +138,6 @@ def main():
target_texts = get_reference_df(args.dataset)
model = Model.Llama_v2_70B
tokenizer = LlamaTokenizerFast.from_pretrained(args.base_model)
relaxing_factor = 1.0
elif args.dataset.lower().endswith(".json"):
target_texts = get_reference_json(args.dataset)
model = Model.GPT_J
@@ -147,7 +146,6 @@
padding_side="left",
use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
relaxing_factor = 0.93
else:
raise RuntimeError(
"Dataset expected to be pkl (open-orca) or json (cnn-dailymail)")
@@ -169,7 +167,7 @@
print("Targets: ", targets)

for k, _ in targets.items():
assert targets[k] * relaxing_factor <= achieved_scores[k]
assert targets[k] <= achieved_scores[k]


if __name__ == "__main__":
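Note: the per-dataset `relaxing_factor` is folded into the GPT-J targets themselves (reference ROUGE scaled by 0.99), so the final check becomes a plain `target <= achieved`. A quick comparison of the effective rouge1 gate, using the numbers from this diff:

```python
# Effective GPT-J rouge1 threshold before and after this change
# (reference values taken from the diff above).
old_reference, old_relaxing_factor = 42.9435135, 0.93
new_reference, new_factor = 42.9865, 0.99

old_threshold = old_reference * old_relaxing_factor   # ~39.94
new_threshold = new_reference * new_factor            # ~42.56
print(f"old gate: {old_threshold:.4f}, new gate: {new_threshold:.4f}")
# The accuracy gate is therefore noticeably tighter after the update.
```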
6 changes: 5 additions & 1 deletion benchmarks/python/enc_dec_benchmark.py
@@ -25,6 +25,7 @@
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime.session import TensorInfo
from tensorrt_llm.runtime import ModelConfig
from tensorrt_llm.models.modeling_utils import get_kv_cache_type_from_legacy


class EncDecBenchmark(BaseBenchmark):
@@ -100,6 +101,9 @@ def read_config(component):
dtype = pretrained_config["dtype"]

paged_kv_cache = plugin_config['paged_kv_cache']
kv_cache_type = get_kv_cache_type_from_legacy(
True, paged_kv_cache)

tokens_per_block = plugin_config['tokens_per_block']

gather_context_logits = builder_config.get(
@@ -120,7 +124,7 @@
num_layers=num_layers,
gpt_attention_plugin=use_gpt_attention_plugin,
remove_input_padding=remove_input_padding,
paged_kv_cache=paged_kv_cache,
kv_cache_type=kv_cache_type,
tokens_per_block=tokens_per_block,
cross_attention=cross_attention,
has_position_embedding=has_position_embedding,
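Note: the benchmark now derives the newer `kv_cache_type` enum from the legacy `paged_kv_cache` boolean. The helper's body is not part of this diff; the sketch below is only a plausible reading of the legacy mapping, assuming the first argument means "KV cache enabled".

```python
# Hypothetical sketch of the legacy -> enum mapping; the real helper lives in
# tensorrt_llm.models.modeling_utils and may differ in detail.
from tensorrt_llm.bindings import KVCacheType

def get_kv_cache_type_from_legacy_sketch(use_kv_cache: bool,
                                         paged_kv_cache: bool) -> KVCacheType:
    if not use_kv_cache:
        return KVCacheType.DISABLED  # assumption: no KV cache at all
    return KVCacheType.PAGED if paged_kv_cache else KVCacheType.CONTINUOUS
```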
11 changes: 9 additions & 2 deletions benchmarks/python/gpt_benchmark.py
@@ -20,6 +20,7 @@
import torch

import tensorrt_llm
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.builder import Engine
from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession,
SamplingConfig)
@@ -77,6 +78,13 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
if hasattr(self, item):
rnn_configs_kwargs[item] = getattr(self, item)

kv_cache_type = KVCacheType.CONTINUOUS
if hasattr(self, 'kv_cache_type'):
kv_cache_type = self.kv_cache_type
else:
if hasattr(self, 'paged_kv_cache'):
kv_cache_type = KVCacheType.PAGED if self.paged_kv_cache == True else KVCacheType.CONTINUOUS

model_config = tensorrt_llm.runtime.ModelConfig(
max_batch_size=self.max_batch_size,
max_beam_width=self.num_beams,
@@ -86,8 +94,7 @@
num_kv_heads=ceil(self.num_kv_heads / self.world_size),
hidden_size=self.hidden_size // self.world_size,
gpt_attention_plugin=self.use_gpt_attention_plugin,
paged_kv_cache=self.paged_kv_cache if hasattr(
self, 'paged_kv_cache') else False,
kv_cache_type=kv_cache_type,
paged_state=self.paged_state
if hasattr(self, 'paged_state') else False,
dtype=self.dtype,
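Note: the selection above prefers an explicit `kv_cache_type` attribute and only falls back to the legacy `paged_kv_cache` flag when it is absent. An equivalent stand-alone sketch of that fallback, assuming the flag is a plain bool and shown purely for readability:

```python
# Same fallback as the diff above, wrapped as a helper for clarity;
# "cfg" stands in for the benchmark object and is illustrative only.
from tensorrt_llm.bindings import KVCacheType

def resolve_kv_cache_type(cfg) -> KVCacheType:
    if hasattr(cfg, 'kv_cache_type'):
        return cfg.kv_cache_type
    return (KVCacheType.PAGED
            if getattr(cfg, 'paged_kv_cache', False)
            else KVCacheType.CONTINUOUS)
```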
45 changes: 43 additions & 2 deletions cpp/CMakeLists.txt
@@ -96,6 +96,23 @@ else()
message(STATUS "Importing nvrtc wrapper")
endif()

if(EXISTS
"${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/kernels/internal_cutlass_kernels/CMakeLists.txt"
)
set(BUILD_INTERNAL_CUTLASS_KERNELS_DEFAULT ON)
else()
set(BUILD_INTERNAL_CUTLASS_KERNELS_DEFAULT OFF)
endif()
option(BUILD_INTERNAL_CUTLASS_KERNELS
"Build internal cutlass kernels from source"
${BUILD_INTERNAL_CUTLASS_KERNELS_DEFAULT})

if(BUILD_INTERNAL_CUTLASS_KERNELS)
message(STATUS "Building internal cutlass kernels")
else()
message(STATUS "Importing internal cutlass kernels")
endif()

if(BUILD_PYT)
message(STATUS "Building PyTorch")
else()
@@ -289,7 +306,7 @@ set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
find_library(RT_LIB rt)

set_ifndef(ENABLE_MULTI_DEVICE 1)
if(ENABLE_MULTI_DEVICE EQUAL 1)
if(ENABLE_MULTI_DEVICE)
# NCCL dependencies
set_ifndef(NCCL_LIB_DIR /usr/lib/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu/)
set_ifndef(NCCL_INCLUDE_DIR /usr/include/)
@@ -364,7 +381,7 @@ endif()
# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G")

set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE}"
"${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE} -DENABLE_UCX=${ENABLE_UCX}"
)

# Fix linking issue with TRT 10, the detailed description about `--mcmodel` can
@@ -521,6 +538,30 @@ elseif(NOT WIN32)
message(STATUS "Build without PyTorch, USE_CXX11_ABI=${USE_CXX11_ABI}")
endif()

# Defer UCX/UCXX setup until after USE_CXX11_ABI is well defined, as UCXX will
# need to be built to have aligned symbols
set_ifndef(ENABLE_UCX 0)
if(ENABLE_UCX)
# Only enable UCX related features if the system has UCX library
find_package(ucx)
if(NOT ${ucx_FOUND})
set(ENABLE_UCX 0)
else()
# installing ucxx via add_subdirectory results in strange cudart linking
# error, thus using their installation script to isolate the installation
# process until the issue is understood. And always trigger the build so
# that change in USE_CXX11_ABI will not be ignored.
execute_process(
COMMAND
${3RDPARTY_DIR}/ucxx/build.sh libucxx -n
--cmake-args=\"-DBUILD_SHARED_LIBS=OFF
-DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=${USE_CXX11_ABI}\"
COMMAND_ECHO STDOUT)
find_package(ucxx REQUIRED PATHS ${3RDPARTY_DIR}/ucxx/cpp/build
NO_DEFAULT_PATH)
endif()
endif()

file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS
REGEX "#define NV_TENSORRT_.*")
foreach(TYPE MAJOR MINOR PATCH BUILD)
23 changes: 23 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/inferenceRequest.h
@@ -18,6 +18,7 @@

#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/namedTensor.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/iTensor.h"

#include <algorithm>
@@ -35,10 +36,12 @@ namespace inference_request
{
// Input tensors
auto constexpr kInputIdsTensorName = "input_ids";
auto constexpr kPositionIdsTensorName = "position_ids";
auto constexpr kDraftInputIdsTensorName = "draft_input_ids";
auto constexpr kDraftLogitsTensorName = "draft_logits";
auto constexpr kMaxNewTokensTensorName = "request_output_len";
auto constexpr kBeamWidthTensorName = "beam_width";
auto constexpr kNumReturnSequencesTensorName = "num_return_sequences";
auto constexpr kEndIdTensorName = "end_id";
auto constexpr kPadIdTensorName = "pad_id";
auto constexpr kBadWordsListTensorName = "bad_words_list";
@@ -165,17 +168,34 @@ class GenericInferenceRequest
mLogitsPostProcessor = cb;
}

[[nodiscard]] std::optional<executor::LookaheadDecodingConfig> getLookaheadConfig() const
{
return mLookaheadConfig;
}

void setLookaheadConfig(executor::LookaheadDecodingConfig config)
{
mLookaheadConfig = config;
}

void clearLookaheadConfig()
{
mLookaheadConfig = std::nullopt;
}

std::optional<LogitsPostProcessor> getLogitsPostProcessor()
{
return mLogitsPostProcessor;
}

static std::array constexpr kTensorNames = {
inference_request::kInputIdsTensorName,
inference_request::kPositionIdsTensorName,
inference_request::kDraftInputIdsTensorName,
inference_request::kDraftLogitsTensorName,
inference_request::kMaxNewTokensTensorName,
inference_request::kBeamWidthTensorName,
inference_request::kNumReturnSequencesTensorName,
inference_request::kEndIdTensorName,
inference_request::kPadIdTensorName,
inference_request::kBadWordsListTensorName,
@@ -240,10 +260,12 @@
}

TENSOR_GETTER_SETTER(InputIds, inference_request::kInputIdsTensorName)
TENSOR_GETTER_SETTER(PositionIds, inference_request::kPositionIdsTensorName)
TENSOR_GETTER_SETTER(DraftInputIds, inference_request::kDraftInputIdsTensorName)
TENSOR_GETTER_SETTER(DraftLogits, inference_request::kDraftLogitsTensorName)
TENSOR_GETTER_SETTER(MaxNewTokens, inference_request::kMaxNewTokensTensorName)
TENSOR_GETTER_SETTER(BeamWidth, inference_request::kBeamWidthTensorName)
TENSOR_GETTER_SETTER(NumReturnSequences, inference_request::kNumReturnSequencesTensorName)
TENSOR_GETTER_SETTER(EndId, inference_request::kEndIdTensorName)
TENSOR_GETTER_SETTER(PadId, inference_request::kPadIdTensorName)
TENSOR_GETTER_SETTER(BadWordsList, inference_request::kBadWordsListTensorName)
@@ -282,6 +304,7 @@ class GenericInferenceRequest
bool mIsStreaming;
TensorMap mInputTensors;
std::optional<LogitsPostProcessor> mLogitsPostProcessor;
std::optional<executor::LookaheadDecodingConfig> mLookaheadConfig;
};

class InferenceRequest : public GenericInferenceRequest<tensorrt_llm::runtime::ITensor::SharedPtr, NamedTensor>
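Note: `inferenceRequest.h` gains `position_ids` and `num_return_sequences` input tensors plus an optional per-request lookahead-decoding config with the usual get/set/clear trio. A small Python mirror of that optional-config pattern, illustrative only; the names below are hypothetical and not part of the TensorRT-LLM Python API.

```python
# Illustrative mirror of the optional per-request config pattern added above;
# class and field names here are hypothetical.
from dataclasses import dataclass
from typing import Optional

@dataclass
class LookaheadConfigSketch:
    window_size: int
    ngram_size: int
    verification_set_size: int

class RequestSketch:
    def __init__(self) -> None:
        self._lookahead: Optional[LookaheadConfigSketch] = None

    def get_lookahead_config(self) -> Optional[LookaheadConfigSketch]:
        return self._lookahead

    def set_lookahead_config(self, cfg: LookaheadConfigSketch) -> None:
        self._lookahead = cfg

    def clear_lookahead_config(self) -> None:
        self._lookahead = None
```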