TensorRT-LLM v0.13 Update #2269

Merged 2 commits on Sep 30, 2024
3 changes: 3 additions & 0 deletions .gitmodules
@@ -11,3 +11,6 @@
[submodule "3rdparty/NVTX"]
path = 3rdparty/NVTX
url = https://github.com/NVIDIA/NVTX.git
[submodule "3rdparty/ucxx"]
path = 3rdparty/ucxx
url = https://github.com/GuanLuo/ucxx.git
1 change: 1 addition & 0 deletions 3rdparty/ucxx
Submodule ucxx added at b99181
10 changes: 5 additions & 5 deletions README.md
@@ -7,8 +7,8 @@ TensorRT-LLM
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.3.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.12.0-green)](./tensorrt_llm/version.py)
[![trt](https://img.shields.io/badge/TRT-10.4.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.13.0-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

[Architecture](./docs/source/architecture/overview.md)   |   [Results](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)
@@ -17,11 +17,11 @@ TensorRT-LLM
<div align="left">

## Latest News
* [2024/08/20] 🏎️SDXL with #TensorRT Model Optimizer ⏱️⚡ 🏁 cache diffusion 🏁 quantization aware training 🏁 QLoRA 🏁 #Python 3.12
[➡️ link](https://developer.nvidia.com/blog/nvidia-tensorrt-model-optimizer-v0-15-boosts-inference-performance-and-expands-model-support/)

* [2024/08/13] 🐍 DIY Code Completion with #Mamba ⚡ #TensorRT #LLM for speed 🤖 NIM for ease ☁️ deploy anywhere
[➡️ link](https://developer.nvidia.com/blog/revolutionizing-code-completion-with-codestral-mamba-the-next-gen-coding-llm/)
<div align="center">
<img src="docs/source/media/picture-08-13-2024.png" width="50%">
<div align="left">

* [2024/08/06] 🗫 Multilingual Challenge Accepted 🗫
🤖 #TensorRT #LLM boosts low-resource languages like Hebrew, Indonesian and Vietnamese ⚡[➡️ link](https://developer.nvidia.com/blog/accelerating-hebrew-llm-performance-with-nvidia-tensorrt-llm/?linkId=100000278659647)
4 changes: 2 additions & 2 deletions benchmarks/cpp/README.md
@@ -267,10 +267,10 @@ for nloras in ${NUM_LORAS[@]}; do
--input-mean 256 --input-stdev 16 --output-mean 128 --output-stdev 24
done

# Generate random lora weights for 256 adapters
# Generate random lora weights for 16 adapters
python benchmarks/cpp/utils/generate_rand_loras.py ${CPP_LORA} ${EG_DIR}/loras 16

# perform benchmarking
# Perform benchmarking

# First run inference without LoRAs
mkdir -p ${EG_DIR}/log-base-lora
142 changes: 106 additions & 36 deletions benchmarks/cpp/gptManagerBenchmark.cpp

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions benchmarks/cpp/gptSessionBenchmark.cpp
@@ -427,7 +427,8 @@ int main(int argc, char* argv[])

options.add_options()("ctx_micro_batch_size", "Batch size for context phase.", cxxopts::value<int>());
options.add_options()("gen_micro_batch_size", "Batch size for generation phase.", cxxopts::value<int>());
options.add_options()("max_attention_window", "Max kv cache length per sequence.", cxxopts::value<int>());
options.add_options()(
"max_attention_window", "Max kv cache length per sequence.", cxxopts::value<std::vector<int>>());
options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value<int>());
options.add_options()("sink_token_len", "Sink token length in kv cache per sequence.", cxxopts::value<int>());
options.add_options()(
@@ -535,7 +536,7 @@ int main(int argc, char* argv[])
// Argument: Max KV Cache Length
if (result.count("max_attention_window"))
{
sessionConfig.kvCacheConfig.maxAttentionWindow = result["max_attention_window"].as<int>();
sessionConfig.kvCacheConfig.maxAttentionWindowVec = result["max_attention_window"].as<std::vector<int>>();
}
// Argument: Sink token length
if (result.count("sink_token_len"))
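Note: `--max_attention_window` now takes a comma-separated list (one value per layer) rather than a single integer. The C++ handling is not visible in this truncated hunk; the sketch below only illustrates the common convention, assumed here, that a vector shorter than the layer count is repeated cyclically across layers.

```python
# Conceptual sketch only (not the benchmark's C++ code): expanding a
# per-layer attention-window vector onto all layers. The cyclic-repetition
# rule is an assumption made for illustration.
from typing import List

def expand_attention_windows(windows: List[int], num_layers: int) -> List[int]:
    """Assign one max attention window to every layer."""
    if not windows:
        raise ValueError("need at least one window size")
    return [windows[i % len(windows)] for i in range(num_layers)]

# e.g. --max_attention_window 4096,4096,1024 on an 8-layer model
print(expand_attention_windows([4096, 4096, 1024], 8))
# [4096, 4096, 1024, 4096, 4096, 1024, 4096, 4096]
```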
25 changes: 12 additions & 13 deletions benchmarks/python/all_reduce.py
@@ -23,7 +23,6 @@

import tensorrt_llm as tllm
from tensorrt_llm import Mapping, Tensor
from tensorrt_llm._ipc_utils import peer_access
from tensorrt_llm._utils import OMPI_COMM_TYPE_HOST, mpi_comm
from tensorrt_llm.functional import AllReduceStrategy, allreduce
from tensorrt_llm.plugin.plugin import current_all_reduce_helper
@@ -106,18 +105,18 @@ def allreduce_benchmark(dtype: str,
_, start = cuda.cuEventCreate(0)
_, stop = cuda.cuEventCreate(0)
runtimes = []
with peer_access(mapping):
tllm.mpi_barrier()

for _ in range(10):
cuda.cuEventRecord(start, stream.cuda_stream)
session.run(inputs=feed_dict,
outputs={"output": output},
stream=stream.cuda_stream)
cuda.cuEventRecord(stop, stream.cuda_stream)
torch.cuda.synchronize()
_, ms = cuda.cuEventElapsedTime(start, stop)
runtimes.append(ms)

tllm.mpi_barrier()

for _ in range(10):
cuda.cuEventRecord(start, stream.cuda_stream)
session.run(inputs=feed_dict,
outputs={"output": output},
stream=stream.cuda_stream)
cuda.cuEventRecord(stop, stream.cuda_stream)
torch.cuda.synchronize()
_, ms = cuda.cuEventElapsedTime(start, stop)
runtimes.append(ms)

median_ms = sorted(runtimes)[len(runtimes) // 2]
assert torch.allclose(output, (input * world_size)**inner_loop)
12 changes: 5 additions & 7 deletions benchmarks/python/check_accuracy_mlperf.py
@@ -9,6 +9,7 @@
from transformers import AutoTokenizer, LlamaTokenizerFast

nltk.download("punkt", quiet=False)
nltk.download('punkt_tab')
import argparse
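Note: recent NLTK releases ship the Punkt sentence tokenizer as the pickle-free `punkt_tab` package, so the extra download keeps the ROUGE post-processing working across NLTK versions. A minimal sketch of the same guard:

```python
# Minimal sketch: fetch both tokenizer packages so the script works across
# NLTK versions (older ones only know "punkt", newer ones use "punkt_tab").
import nltk

nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
```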


@@ -25,10 +26,9 @@ class Model(Enum):
"tokens_per_sample": 294.45 * 0.9
},
Model.GPT_J: {
"rouge1": 42.9435135,
"rouge2": 20.1033765,
"rougeL": 29.9581119,
# "tokens_per_sample": ??
"rouge1": 42.9865 * 0.99,
"rouge2": 20.1235 * 0.99,
"rougeL": 29.9881 * 0.99,
}
}

@@ -138,7 +138,6 @@ def main():
target_texts = get_reference_df(args.dataset)
model = Model.Llama_v2_70B
tokenizer = LlamaTokenizerFast.from_pretrained(args.base_model)
relaxing_factor = 1.0
elif args.dataset.lower().endswith(".json"):
target_texts = get_reference_json(args.dataset)
model = Model.GPT_J
@@ -147,7 +146,6 @@
padding_side="left",
use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
relaxing_factor = 0.93
else:
raise RuntimeError(
"Dataset expected to be pkl (open-orca) or json (cnn-dailymail)")
@@ -169,7 +167,7 @@
print("Targets: ", targets)

for k, _ in targets.items():
assert targets[k] * relaxing_factor <= achieved_scores[k]
assert targets[k] <= achieved_scores[k]


if __name__ == "__main__":
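Note: the per-dataset `relaxing_factor` is folded into the GPT-J targets themselves (reference ROUGE scaled by 0.99), so the final check becomes a plain `target <= achieved`. A quick comparison of the effective rouge1 gate, using the numbers from this diff:

```python
# Effective GPT-J rouge1 threshold before and after this change
# (reference values taken from the diff above).
old_reference, old_relaxing_factor = 42.9435135, 0.93
new_reference, new_factor = 42.9865, 0.99

old_threshold = old_reference * old_relaxing_factor   # ~39.94
new_threshold = new_reference * new_factor            # ~42.56
print(f"old gate: {old_threshold:.4f}, new gate: {new_threshold:.4f}")
# The accuracy gate is therefore noticeably tighter after the update.
```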
6 changes: 5 additions & 1 deletion benchmarks/python/enc_dec_benchmark.py
@@ -25,6 +25,7 @@
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime.session import TensorInfo
from tensorrt_llm.runtime import ModelConfig
from tensorrt_llm.models.modeling_utils import get_kv_cache_type_from_legacy


class EncDecBenchmark(BaseBenchmark):
@@ -100,6 +101,9 @@ def read_config(component):
dtype = pretrained_config["dtype"]

paged_kv_cache = plugin_config['paged_kv_cache']
kv_cache_type = get_kv_cache_type_from_legacy(
True, paged_kv_cache)

tokens_per_block = plugin_config['tokens_per_block']

gather_context_logits = builder_config.get(
@@ -120,7 +124,7 @@
num_layers=num_layers,
gpt_attention_plugin=use_gpt_attention_plugin,
remove_input_padding=remove_input_padding,
paged_kv_cache=paged_kv_cache,
kv_cache_type=kv_cache_type,
tokens_per_block=tokens_per_block,
cross_attention=cross_attention,
has_position_embedding=has_position_embedding,
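Note: the benchmark now derives the newer `kv_cache_type` enum from the legacy `paged_kv_cache` boolean. The helper's body is not part of this diff; the sketch below is only a plausible reading of the legacy mapping, assuming the first argument means "KV cache enabled".

```python
# Hypothetical sketch of the legacy -> enum mapping; the real helper lives in
# tensorrt_llm.models.modeling_utils and may differ in detail.
from tensorrt_llm.bindings import KVCacheType

def get_kv_cache_type_from_legacy_sketch(use_kv_cache: bool,
                                         paged_kv_cache: bool) -> KVCacheType:
    if not use_kv_cache:
        return KVCacheType.DISABLED  # assumption: no KV cache at all
    return KVCacheType.PAGED if paged_kv_cache else KVCacheType.CONTINUOUS
```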
11 changes: 9 additions & 2 deletions benchmarks/python/gpt_benchmark.py
@@ -20,6 +20,7 @@
import torch

import tensorrt_llm
from tensorrt_llm.bindings import KVCacheType
from tensorrt_llm.builder import Engine
from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession,
SamplingConfig)
@@ -77,6 +78,13 @@ def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
if hasattr(self, item):
rnn_configs_kwargs[item] = getattr(self, item)

kv_cache_type = KVCacheType.CONTINUOUS
if hasattr(self, 'kv_cache_type'):
kv_cache_type = self.kv_cache_type
else:
if hasattr(self, 'paged_kv_cache'):
kv_cache_type = KVCacheType.PAGED if self.paged_kv_cache == True else KVCacheType.CONTINUOUS

model_config = tensorrt_llm.runtime.ModelConfig(
max_batch_size=self.max_batch_size,
max_beam_width=self.num_beams,
@@ -86,8 +94,7 @@
num_kv_heads=ceil(self.num_kv_heads / self.world_size),
hidden_size=self.hidden_size // self.world_size,
gpt_attention_plugin=self.use_gpt_attention_plugin,
paged_kv_cache=self.paged_kv_cache if hasattr(
self, 'paged_kv_cache') else False,
kv_cache_type=kv_cache_type,
paged_state=self.paged_state
if hasattr(self, 'paged_state') else False,
dtype=self.dtype,
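Note: the selection above prefers an explicit `kv_cache_type` attribute and only falls back to the legacy `paged_kv_cache` flag when it is absent. An equivalent stand-alone sketch of that fallback, assuming the flag is a plain bool and shown purely for readability:

```python
# Same fallback as the diff above, wrapped as a helper for clarity;
# "cfg" stands in for the benchmark object and is illustrative only.
from tensorrt_llm.bindings import KVCacheType

def resolve_kv_cache_type(cfg) -> KVCacheType:
    if hasattr(cfg, 'kv_cache_type'):
        return cfg.kv_cache_type
    return (KVCacheType.PAGED
            if getattr(cfg, 'paged_kv_cache', False)
            else KVCacheType.CONTINUOUS)
```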
45 changes: 43 additions & 2 deletions cpp/CMakeLists.txt
@@ -96,6 +96,23 @@ else()
message(STATUS "Importing nvrtc wrapper")
endif()

if(EXISTS
"${CMAKE_CURRENT_SOURCE_DIR}/tensorrt_llm/kernels/internal_cutlass_kernels/CMakeLists.txt"
)
set(BUILD_INTERNAL_CUTLASS_KERNELS_DEFAULT ON)
else()
set(BUILD_INTERNAL_CUTLASS_KERNELS_DEFAULT OFF)
endif()
option(BUILD_INTERNAL_CUTLASS_KERNELS
"Build internal cutlass kernels from source"
${BUILD_INTERNAL_CUTLASS_KERNELS_DEFAULT})

if(BUILD_INTERNAL_CUTLASS_KERNELS)
message(STATUS "Building internal cutlass kernels")
else()
message(STATUS "Importing internal cutlass kernels")
endif()

if(BUILD_PYT)
message(STATUS "Building PyTorch")
else()
@@ -289,7 +306,7 @@ set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
find_library(RT_LIB rt)

set_ifndef(ENABLE_MULTI_DEVICE 1)
if(ENABLE_MULTI_DEVICE EQUAL 1)
if(ENABLE_MULTI_DEVICE)
# NCCL dependencies
set_ifndef(NCCL_LIB_DIR /usr/lib/${CMAKE_SYSTEM_PROCESSOR}-linux-gnu/)
set_ifndef(NCCL_INCLUDE_DIR /usr/include/)
@@ -364,7 +381,7 @@ endif()
# set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -G")

set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE}"
"${CMAKE_CXX_FLAGS} -DBUILD_SYSTEM=cmake_oss -DENABLE_MULTI_DEVICE=${ENABLE_MULTI_DEVICE} -DENABLE_UCX=${ENABLE_UCX}"
)

# Fix linking issue with TRT 10, the detailed description about `--mcmodel` can
@@ -521,6 +538,30 @@ elseif(NOT WIN32)
message(STATUS "Build without PyTorch, USE_CXX11_ABI=${USE_CXX11_ABI}")
endif()

# Defer UCX/UCXX setup until after USE_CXX11_ABI is well defined, as UCXX will
# need to be built to have aligned symbols
set_ifndef(ENABLE_UCX 0)
if(ENABLE_UCX)
# Only enable UCX related features if the system has UCX library
find_package(ucx)
if(NOT ${ucx_FOUND})
set(ENABLE_UCX 0)
else()
# installing ucxx via add_subdirectory results in strange cudart linking
# error, thus using their installation script to isolate the installation
# process until the issue is understood. And always trigger the build so
# that change in USE_CXX11_ABI will not be ignored.
execute_process(
COMMAND
${3RDPARTY_DIR}/ucxx/build.sh libucxx -n
--cmake-args=\"-DBUILD_SHARED_LIBS=OFF
-DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=${USE_CXX11_ABI}\"
COMMAND_ECHO STDOUT)
find_package(ucxx REQUIRED PATHS ${3RDPARTY_DIR}/ucxx/cpp/build
NO_DEFAULT_PATH)
endif()
endif()

file(STRINGS "${TRT_INCLUDE_DIR}/NvInferVersion.h" VERSION_STRINGS
REGEX "#define NV_TENSORRT_.*")
foreach(TYPE MAJOR MINOR PATCH BUILD)
23 changes: 23 additions & 0 deletions cpp/include/tensorrt_llm/batch_manager/inferenceRequest.h
@@ -18,6 +18,7 @@

#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/batch_manager/namedTensor.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/iTensor.h"

#include <algorithm>
@@ -35,10 +36,12 @@ namespace inference_request
{
// Input tensors
auto constexpr kInputIdsTensorName = "input_ids";
auto constexpr kPositionIdsTensorName = "position_ids";
auto constexpr kDraftInputIdsTensorName = "draft_input_ids";
auto constexpr kDraftLogitsTensorName = "draft_logits";
auto constexpr kMaxNewTokensTensorName = "request_output_len";
auto constexpr kBeamWidthTensorName = "beam_width";
auto constexpr kNumReturnSequencesTensorName = "num_return_sequences";
auto constexpr kEndIdTensorName = "end_id";
auto constexpr kPadIdTensorName = "pad_id";
auto constexpr kBadWordsListTensorName = "bad_words_list";
@@ -165,17 +168,34 @@ class GenericInferenceRequest
mLogitsPostProcessor = cb;
}

[[nodiscard]] std::optional<executor::LookaheadDecodingConfig> getLookaheadConfig() const
{
return mLookaheadConfig;
}

void setLookaheadConfig(executor::LookaheadDecodingConfig config)
{
mLookaheadConfig = config;
}

void clearLookaheadConfig()
{
mLookaheadConfig = std::nullopt;
}

std::optional<LogitsPostProcessor> getLogitsPostProcessor()
{
return mLogitsPostProcessor;
}

static std::array constexpr kTensorNames = {
inference_request::kInputIdsTensorName,
inference_request::kPositionIdsTensorName,
inference_request::kDraftInputIdsTensorName,
inference_request::kDraftLogitsTensorName,
inference_request::kMaxNewTokensTensorName,
inference_request::kBeamWidthTensorName,
inference_request::kNumReturnSequencesTensorName,
inference_request::kEndIdTensorName,
inference_request::kPadIdTensorName,
inference_request::kBadWordsListTensorName,
@@ -240,10 +260,12 @@
}

TENSOR_GETTER_SETTER(InputIds, inference_request::kInputIdsTensorName)
TENSOR_GETTER_SETTER(PositionIds, inference_request::kPositionIdsTensorName)
TENSOR_GETTER_SETTER(DraftInputIds, inference_request::kDraftInputIdsTensorName)
TENSOR_GETTER_SETTER(DraftLogits, inference_request::kDraftLogitsTensorName)
TENSOR_GETTER_SETTER(MaxNewTokens, inference_request::kMaxNewTokensTensorName)
TENSOR_GETTER_SETTER(BeamWidth, inference_request::kBeamWidthTensorName)
TENSOR_GETTER_SETTER(NumReturnSequences, inference_request::kNumReturnSequencesTensorName)
TENSOR_GETTER_SETTER(EndId, inference_request::kEndIdTensorName)
TENSOR_GETTER_SETTER(PadId, inference_request::kPadIdTensorName)
TENSOR_GETTER_SETTER(BadWordsList, inference_request::kBadWordsListTensorName)
@@ -282,6 +304,7 @@ class GenericInferenceRequest
bool mIsStreaming;
TensorMap mInputTensors;
std::optional<LogitsPostProcessor> mLogitsPostProcessor;
std::optional<executor::LookaheadDecodingConfig> mLookaheadConfig;
};

class InferenceRequest : public GenericInferenceRequest<tensorrt_llm::runtime::ITensor::SharedPtr, NamedTensor>
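Note: `inferenceRequest.h` gains `position_ids` and `num_return_sequences` input tensors plus an optional per-request lookahead-decoding config with the usual get/set/clear trio. A small Python mirror of that optional-config pattern, illustrative only; the names below are hypothetical and not part of the TensorRT-LLM Python API.

```python
# Illustrative mirror of the optional per-request config pattern added above;
# class and field names here are hypothetical.
from dataclasses import dataclass
from typing import Optional

@dataclass
class LookaheadConfigSketch:
    window_size: int
    ngram_size: int
    verification_set_size: int

class RequestSketch:
    def __init__(self) -> None:
        self._lookahead: Optional[LookaheadConfigSketch] = None

    def get_lookahead_config(self) -> Optional[LookaheadConfigSketch]:
        return self._lookahead

    def set_lookahead_config(self, cfg: LookaheadConfigSketch) -> None:
        self._lookahead = cfg

    def clear_lookahead_config(self) -> None:
        self._lookahead = None
```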