
Commit 9b931c0

Update TensorRT-LLM (#2873)
1 parent c384d26 commit 9b931c0

File tree

629 files changed (+1,305,551 / -20,105 lines)


.cursorignore (+2)

@@ -0,0 +1,2 @@
+# Add directories or file patterns to ignore during indexing (e.g. foo/ or *.csv)
+*cubin.cpp

.pre-commit-config.yaml (+1 / -1)

@@ -4,7 +4,7 @@ repos:
   hooks:
   - id: isort
 - repo: https://github.com/Lucas-C/pre-commit-hooks.git
-  rev: v1.1.13
+  rev: v1.5.5
   hooks:
   - id: remove-crlf
 - repo: https://github.com/google/yapf

benchmarks/cpp/disaggServerBenchmark.cpp (-1)

@@ -15,7 +15,6 @@
  * limitations under the License.
  */
 
-#include "tensorrt_llm/batch_manager/GptManager.h"
 #include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/executor/disaggServerUtil.h"

cpp/CMakeLists.txt (+56)

@@ -524,6 +524,62 @@ if((WIN32))
   endif()
 endif()
 
+if(SANITIZE)
+  if(WIN32)
+    message(FATAL_ERROR "Sanitizing support is unimplemented on Windows.")
+  endif()
+
+  macro(add_clang_rt_lib lib_name)
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      execute_process(
+        COMMAND
+          ${CMAKE_CXX_COMPILER}
+          "-print-file-name=libclang_rt.${lib_name}-${CMAKE_SYSTEM_PROCESSOR}.so"
+        OUTPUT_VARIABLE CLANG_SAN_LIBRARY_PATH OUTPUT_STRIP_TRAILING_WHITESPACE)
+      link_libraries(${CLANG_SAN_LIBRARY_PATH})
+    endif()
+  endmacro()
+
+  string(TOLOWER ${SANITIZE} SANITIZE)
+
+  if("undefined" IN_LIST SANITIZE)
+    message(STATUS "Enabling extra sub-sanitizers for UBSan")
+    list(APPEND SANITIZE "float-divide-by-zero")
+
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      list(APPEND SANITIZE "unsigned-integer-overflow" "implicit-conversion"
+           "local-bounds")
+    endif()
+    add_clang_rt_lib("ubsan_standalone")
+    add_compile_definitions("SANITIZE_UNDEFINED")
+  endif()
+
+  if("address" IN_LIST SANITIZE)
+    message(STATUS "Enabling extra sub-sanitizers for ASan")
+    list(APPEND SANITIZE "pointer-compare" "pointer-subtract")
+    add_compile_options("-fno-omit-frame-pointer;-fno-optimize-sibling-calls")
+
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      add_compile_options("-fsanitize-address-use-after-return=always")
+      add_link_options("-fsanitize-address-use-after-return=always")
+    endif()
+    add_clang_rt_lib("asan")
+  endif()
+
+  if("thread" IN_LIST SANITIZE)
+    add_compile_options("-ftls-model=local-dynamic")
+    add_clang_rt_lib("tsan")
+  endif()
+
+  list(REMOVE_DUPLICATES SANITIZE)
+  message(STATUS "Enabled sanitizers: ${SANITIZE}")
+
+  foreach(SANITIZER IN LISTS SANITIZE)
+    add_compile_options("-fsanitize=${SANITIZER}")
+    add_link_options("-fsanitize=${SANITIZER}")
+  endforeach()
+endif()
+
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 if(FAST_MATH)
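The new block is gated on a SANITIZE variable, so it is presumably enabled at configure time with something like -DSANITIZE=address or a list such as -DSANITIZE="address;undefined" (the exact invocation is not shown in this diff). string(TOLOWER ...) normalizes the requested value, extra sub-sanitizers are appended per sanitizer, duplicates are removed, and each remaining entry is applied as -fsanitize=<name> to both compile and link flags; on Clang the matching libclang_rt runtime is linked explicitly through the add_clang_rt_lib macro.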

cpp/include/tensorrt_llm/batch_manager/GptManager.h (-131)

This file was deleted.

cpp/include/tensorrt_llm/batch_manager/callbacks.h (-37)

This file was deleted.

cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h (+10 / -10)

@@ -43,7 +43,7 @@ class CreateNewDecoderRequests : Algorithm
 
     using SizeType32 = tensorrt_llm::runtime::SizeType32;
     using SamplingConfig = tensorrt_llm::runtime::SamplingConfig;
-    using CudaStreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>;
+    using CudaStream = tensorrt_llm::runtime::CudaStream;
     using TensorPtr = runtime::ITensor::SharedPtr;
     using SharedConstPtr = runtime::ITensor::SharedConstPtr;
     using DecodingInput = runtime::DecodingInput;
@@ -55,42 +55,42 @@ class CreateNewDecoderRequests : Algorithm
 
     void operator()(TensorPtr const& batchSlots, std::vector<runtime::decoder_batch::Request> const& requests,
         std::vector<SamplingConfig> const& samplingConfigs, runtime::ModelConfig const& modelConfig,
-        GptDecoderBatched& decoder, CudaStreamPtr const& runtimeStream, SizeType32 maxSequenceLength) const;
+        GptDecoderBatched& decoder, CudaStream const& runtimeStream, SizeType32 maxSequenceLength) const;
 
     //! @brief Initialize the decoder at `batchSlot` with a new `request`. Exposed only for static batching via
     //! GptDecoderBatched::newBatch()
     void newRequest(SizeType32 batchSlot, runtime::decoder_batch::Request const& request,
         SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig, GptDecoderBatched& decoder,
-        CudaStreamPtr runtimeStream, SizeType32 maxSequenceLength) const;
+        CudaStream const& runtimeStream, SizeType32 maxSequenceLength) const;
 
 private:
     //! @brief Setups decoder internal tensors for new speculative decoding request
     void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
         SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStreamPtr runtimeStream,
-        CudaStreamPtr decoderStream, SpeculativeDecodingMode const& speculativeDecodingMode,
+        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream,
+        CudaStream const& decoderStream, SpeculativeDecodingMode const& speculativeDecodingMode,
         SizeType32 maxDecodingEngineTokens) const;
 
     //! @brief Setups decoder internal tensors for new request in Draft model Sps mode
     void newRequestDraftTokensExternal(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, DecodingInput& jointDecodingInput, CudaStreamPtr decoderStream) const;
+        SamplingConfig const& samplingConfig, DecodingInput& jointDecodingInput, CudaStream const& decoderStream) const;
 
     //! @brief Setups decoder internal tensors for new Medusa request
     void newRequestMedusa(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, CudaStreamPtr decoderStream, SizeType32 maxDecodingEngineTokens) const;
+        DecodingInput& jointDecodingInput, CudaStream const& decoderStream, SizeType32 maxDecodingEngineTokens) const;
 
     //! @brief Setups decoder internal tensors for new Lookahead request
     void newRequestLookahead(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStreamPtr runtimeStream) const;
+        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream) const;
 
     //! @brief Setups decoder internal tensors for new Explicit draft tokens request
     void newRequestExplicitDraftTokens(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingOutput& jointDecodingOutput, CudaStreamPtr runtimeStream) const;
+        DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream) const;
 
     //! @brief Setups decoder internal tensors for new Eagle request
     void newRequestEagle(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
         runtime::ModelConfig const& modelConfig, DecodingOutput& jointDecodingOutput,
-        CudaStreamPtr runtimeStream) const;
+        CudaStream const& runtimeStream) const;
 };
 
 } // namespace tensorrt_llm::batch_manager
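Throughout this header, parameters of type CudaStreamPtr (a std::shared_ptr<tensorrt_llm::runtime::CudaStream>) become CudaStream const&, so the callee borrows the stream instead of sharing ownership of it. A minimal, self-contained sketch of that pattern, using hypothetical stand-in types rather than the real TensorRT-LLM classes:

// Hypothetical stand-in for tensorrt_llm::runtime::CudaStream; the real class
// wraps a CUDA stream handle. Only the parameter-passing change is illustrated.
#include <memory>

struct Stream
{
};

// Old style: the callee takes part in ownership even though it only needs the
// stream for the duration of the call (an extra shared_ptr copy and refcount bump).
void newRequestOld(std::shared_ptr<Stream> stream)
{
    (void) stream;
}

// New style: the callee borrows the stream; ownership stays with the caller and
// the signature documents that nothing is retained.
void newRequestNew(Stream const& stream)
{
    (void) stream;
}

int main()
{
    auto stream = std::make_shared<Stream>();
    newRequestOld(stream);  // copies the shared_ptr
    newRequestNew(*stream); // call sites that held a shared_ptr simply dereference it
    return 0;
}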

cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h (+3 / -8)

@@ -27,11 +27,6 @@
 #include <optional>
 #include <vector>
 
-namespace tensorrt_llm::runtime
-{
-class TllmRuntime;
-} // namespace tensorrt_llm::runtime
-
 namespace tensorrt_llm::batch_manager
 {
 
@@ -141,7 +136,7 @@ class DecoderBuffers
     std::vector<std::vector<runtime::ITensor::SharedPtr>>
         predictedDraftLogits; // [mMaxNumRequests][mMaxNumHeads][maxDraftTokens + 1, vocabSize]
 
-    void create(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::TllmRuntime const& runtime,
+    void create(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::BufferManager const& manager,
         runtime::ModelConfig const& modelConfig);
 };
 
@@ -151,7 +146,7 @@ class DecoderBuffers
     std::optional<runtime::LookaheadDecodingBuffers> lookaheadBuffers;
 
     DecoderBuffers(SizeType32 maxNumSequences, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow,
-        SizeType32 maxSeqLen, SizeType32 maxTokensPerStep, runtime::TllmRuntime const& runtime,
+        SizeType32 maxSeqLen, SizeType32 maxTokensPerStep, runtime::BufferManager const& manager,
         runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig);
 
     std::unique_ptr<DecoderStepAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
@@ -182,7 +177,7 @@ class SlotDecoderBuffers
     TensorPtr logProbsHost;      // [beamWidth, maxSeqLen]
     TensorPtr finishReasonsHost; // [beamWidth]
 
-    SlotDecoderBuffers(SizeType32 maxBeamWidth, SizeType32 maxSeqLen, runtime::TllmRuntime const& runtime);
+    SlotDecoderBuffers(SizeType32 maxBeamWidth, SizeType32 maxSeqLen, runtime::BufferManager const& manager);
 
     static std::unique_ptr<DecoderSlotAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
         TensorPtr const& outputIdsView, TensorPtr const& sequenceLengthView, TensorPtr const& cumLogProbsView,
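These constructors and the create() helper previously took the full runtime::TllmRuntime, presumably only to reach its buffer manager; they now accept runtime::BufferManager const& directly, which narrows the dependency and makes the TllmRuntime forward declaration unnecessary.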

cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h (+4)

@@ -52,6 +52,8 @@ class BaseEvictionPolicy
         = 0;
     /// @brief Perform any per-iteration bookkeeping
     virtual void refresh() = 0;
+
+    virtual bool verifyQueueIntegrity() = 0;
 };
 
 struct ExpiringBlockComparator
@@ -87,6 +89,8 @@ class LRUEvictionPolicy : public BaseEvictionPolicy
     // Making this public and virtual makes it possible to test.
     [[nodiscard]] virtual std::chrono::steady_clock::time_point::duration getTime() const;
 
+    bool verifyQueueIntegrity() override;
+
 private:
     // Check if the block should be added to mFreeQueues.
     bool isReleasedLeafBlock(BlockPtr const& block);
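verifyQueueIntegrity() is added as a pure virtual on BaseEvictionPolicy and overridden by LRUEvictionPolicy; its implementation is not part of this header diff. As a rough illustration of what such a check can assert, here is a minimal sketch using a hypothetical free queue (a list of block ids plus a lookup set), not the actual TensorRT-LLM data structures:

#include <list>
#include <unordered_set>

// Hypothetical, simplified stand-in for an LRU eviction policy's free queue.
// The real classes track KV-cache blocks; plain ints are enough for the idea.
class ToyLruPolicy
{
public:
    void release(int blockId)
    {
        mFreeQueue.push_back(blockId);
        mFreeSet.insert(blockId);
    }

    // Integrity check in the spirit of verifyQueueIntegrity(): the queue and
    // the lookup set must describe exactly the same blocks, with no duplicates.
    bool verifyQueueIntegrity() const
    {
        std::unordered_set<int> seen;
        for (int blockId : mFreeQueue)
        {
            if (!seen.insert(blockId).second) // duplicate entry in the queue
                return false;
            if (mFreeSet.count(blockId) == 0) // queued block unknown to the policy
                return false;
        }
        return seen.size() == mFreeSet.size(); // no tracked block missing from the queue
    }

private:
    std::list<int> mFreeQueue;
    std::unordered_set<int> mFreeSet;
};

int main()
{
    ToyLruPolicy policy;
    policy.release(1);
    policy.release(2);
    return policy.verifyQueueIntegrity() ? 0 : 1;
}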
