
Commit 9b931c0

Update TensorRT-LLM (#2873)
1 parent c384d26 commit 9b931c0

File tree

629 files changed (+1,305,551 / -20,105 lines)


.cursorignore (+2)

@@ -0,0 +1,2 @@
+# Add directories or file patterns to ignore during indexing (e.g. foo/ or *.csv)
+*cubin.cpp

.pre-commit-config.yaml (+1 / -1)

@@ -4,7 +4,7 @@ repos:
   hooks:
   - id: isort
 - repo: https://github.com/Lucas-C/pre-commit-hooks.git
-  rev: v1.1.13
+  rev: v1.5.5
   hooks:
   - id: remove-crlf
 - repo: https://github.com/google/yapf

benchmarks/cpp/disaggServerBenchmark.cpp (-1)

@@ -15,7 +15,6 @@
  * limitations under the License.
  */
 
-#include "tensorrt_llm/batch_manager/GptManager.h"
 #include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/executor/disaggServerUtil.h"

cpp/CMakeLists.txt (+56)

@@ -524,6 +524,62 @@ if((WIN32))
   endif()
 endif()
 
+if(SANITIZE)
+  if(WIN32)
+    message(FATAL_ERROR "Sanitizing support is unimplemented on Windows.")
+  endif()
+
+  macro(add_clang_rt_lib lib_name)
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      execute_process(
+        COMMAND
+          ${CMAKE_CXX_COMPILER}
+          "-print-file-name=libclang_rt.${lib_name}-${CMAKE_SYSTEM_PROCESSOR}.so"
+        OUTPUT_VARIABLE CLANG_SAN_LIBRARY_PATH OUTPUT_STRIP_TRAILING_WHITESPACE)
+      link_libraries(${CLANG_SAN_LIBRARY_PATH})
+    endif()
+  endmacro()
+
+  string(TOLOWER ${SANITIZE} SANITIZE)
+
+  if("undefined" IN_LIST SANITIZE)
+    message(STATUS "Enabling extra sub-sanitizers for UBSan")
+    list(APPEND SANITIZE "float-divide-by-zero")
+
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      list(APPEND SANITIZE "unsigned-integer-overflow" "implicit-conversion"
+           "local-bounds")
+    endif()
+    add_clang_rt_lib("ubsan_standalone")
+    add_compile_definitions("SANITIZE_UNDEFINED")
+  endif()
+
+  if("address" IN_LIST SANITIZE)
+    message(STATUS "Enabling extra sub-sanitizers for ASan")
+    list(APPEND SANITIZE "pointer-compare" "pointer-subtract")
+    add_compile_options("-fno-omit-frame-pointer;-fno-optimize-sibling-calls")
+
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      add_compile_options("-fsanitize-address-use-after-return=always")
+      add_link_options("-fsanitize-address-use-after-return=always")
+    endif()
+    add_clang_rt_lib("asan")
+  endif()
+
+  if("thread" IN_LIST SANITIZE)
+    add_compile_options("-ftls-model=local-dynamic")
+    add_clang_rt_lib("tsan")
+  endif()
+
+  list(REMOVE_DUPLICATES SANITIZE)
+  message(STATUS "Enabled sanitizers: ${SANITIZE}")
+
+  foreach(SANITIZER IN LISTS SANITIZE)
+    add_compile_options("-fsanitize=${SANITIZER}")
+    add_link_options("-fsanitize=${SANITIZER}")
+  endforeach()
+endif()
+
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 if(FAST_MATH)
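The new block is gated on a SANITIZE variable, so it is presumably enabled at configure time with something like -DSANITIZE=address or a list such as -DSANITIZE="address;undefined" (the exact invocation is not shown in this diff). string(TOLOWER ...) normalizes the requested value, extra sub-sanitizers are appended per sanitizer, duplicates are removed, and each remaining entry is applied as -fsanitize=<name> to both compile and link flags; on Clang the matching libclang_rt runtime is linked explicitly through the add_clang_rt_lib macro.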

cpp/include/tensorrt_llm/batch_manager/GptManager.h (-131)

This file was deleted.

cpp/include/tensorrt_llm/batch_manager/callbacks.h (-37)

This file was deleted.

cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h (+10 / -10)

@@ -43,7 +43,7 @@ class CreateNewDecoderRequests : Algorithm
 
     using SizeType32 = tensorrt_llm::runtime::SizeType32;
     using SamplingConfig = tensorrt_llm::runtime::SamplingConfig;
-    using CudaStreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>;
+    using CudaStream = tensorrt_llm::runtime::CudaStream;
     using TensorPtr = runtime::ITensor::SharedPtr;
     using SharedConstPtr = runtime::ITensor::SharedConstPtr;
     using DecodingInput = runtime::DecodingInput;
@@ -55,42 +55,42 @@ class CreateNewDecoderRequests : Algorithm
 
     void operator()(TensorPtr const& batchSlots, std::vector<runtime::decoder_batch::Request> const& requests,
         std::vector<SamplingConfig> const& samplingConfigs, runtime::ModelConfig const& modelConfig,
-        GptDecoderBatched& decoder, CudaStreamPtr const& runtimeStream, SizeType32 maxSequenceLength) const;
+        GptDecoderBatched& decoder, CudaStream const& runtimeStream, SizeType32 maxSequenceLength) const;
 
     //! @brief Initialize the decoder at `batchSlot` with a new `request`. Exposed only for static batching via
     //! GptDecoderBatched::newBatch()
     void newRequest(SizeType32 batchSlot, runtime::decoder_batch::Request const& request,
         SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig, GptDecoderBatched& decoder,
-        CudaStreamPtr runtimeStream, SizeType32 maxSequenceLength) const;
+        CudaStream const& runtimeStream, SizeType32 maxSequenceLength) const;
 
 private:
     //! @brief Setups decoder internal tensors for new speculative decoding request
     void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
         SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStreamPtr runtimeStream,
-        CudaStreamPtr decoderStream, SpeculativeDecodingMode const& speculativeDecodingMode,
+        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream,
+        CudaStream const& decoderStream, SpeculativeDecodingMode const& speculativeDecodingMode,
         SizeType32 maxDecodingEngineTokens) const;
 
     //! @brief Setups decoder internal tensors for new request in Draft model Sps mode
     void newRequestDraftTokensExternal(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, DecodingInput& jointDecodingInput, CudaStreamPtr decoderStream) const;
+        SamplingConfig const& samplingConfig, DecodingInput& jointDecodingInput, CudaStream const& decoderStream) const;
 
     //! @brief Setups decoder internal tensors for new Medusa request
     void newRequestMedusa(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, CudaStreamPtr decoderStream, SizeType32 maxDecodingEngineTokens) const;
+        DecodingInput& jointDecodingInput, CudaStream const& decoderStream, SizeType32 maxDecodingEngineTokens) const;
 
     //! @brief Setups decoder internal tensors for new Lookahead request
     void newRequestLookahead(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStreamPtr runtimeStream) const;
+        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream) const;
 
     //! @brief Setups decoder internal tensors for new Explicit draft tokens request
     void newRequestExplicitDraftTokens(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingOutput& jointDecodingOutput, CudaStreamPtr runtimeStream) const;
+        DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream) const;
 
     //! @brief Setups decoder internal tensors for new Eagle request
     void newRequestEagle(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
         runtime::ModelConfig const& modelConfig, DecodingOutput& jointDecodingOutput,
-        CudaStreamPtr runtimeStream) const;
+        CudaStream const& runtimeStream) const;
 };
 
 } // namespace tensorrt_llm::batch_manager
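Throughout this header, parameters of type CudaStreamPtr (a std::shared_ptr<tensorrt_llm::runtime::CudaStream>) become CudaStream const&, so the callee borrows the stream instead of sharing ownership of it. A minimal, self-contained sketch of that pattern, using hypothetical stand-in types rather than the real TensorRT-LLM classes:

// Hypothetical stand-in for tensorrt_llm::runtime::CudaStream; the real class
// wraps a CUDA stream handle. Only the parameter-passing change is illustrated.
#include <memory>

struct Stream
{
};

// Old style: the callee takes part in ownership even though it only needs the
// stream for the duration of the call (an extra shared_ptr copy and refcount bump).
void newRequestOld(std::shared_ptr<Stream> stream)
{
    (void) stream;
}

// New style: the callee borrows the stream; ownership stays with the caller and
// the signature documents that nothing is retained.
void newRequestNew(Stream const& stream)
{
    (void) stream;
}

int main()
{
    auto stream = std::make_shared<Stream>();
    newRequestOld(stream);  // copies the shared_ptr
    newRequestNew(*stream); // call sites that held a shared_ptr simply dereference it
    return 0;
}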

cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h (+3 / -8)

@@ -27,11 +27,6 @@
 #include <optional>
 #include <vector>
 
-namespace tensorrt_llm::runtime
-{
-class TllmRuntime;
-} // namespace tensorrt_llm::runtime
-
 namespace tensorrt_llm::batch_manager
 {
 
@@ -141,7 +136,7 @@ class DecoderBuffers
     std::vector<std::vector<runtime::ITensor::SharedPtr>>
         predictedDraftLogits; // [mMaxNumRequests][mMaxNumHeads][maxDraftTokens + 1, vocabSize]
 
-    void create(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::TllmRuntime const& runtime,
+    void create(SizeType32 maxNumSequences, SizeType32 maxTokensPerStep, runtime::BufferManager const& manager,
         runtime::ModelConfig const& modelConfig);
 };
 
@@ -151,7 +146,7 @@ class DecoderBuffers
     std::optional<runtime::LookaheadDecodingBuffers> lookaheadBuffers;
 
     DecoderBuffers(SizeType32 maxNumSequences, SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow,
-        SizeType32 maxSeqLen, SizeType32 maxTokensPerStep, runtime::TllmRuntime const& runtime,
+        SizeType32 maxSeqLen, SizeType32 maxTokensPerStep, runtime::BufferManager const& manager,
         runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig);
 
     std::unique_ptr<DecoderStepAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
@@ -182,7 +177,7 @@ class SlotDecoderBuffers
     TensorPtr logProbsHost;      // [beamWidth, maxSeqLen]
     TensorPtr finishReasonsHost; // [beamWidth]
 
-    SlotDecoderBuffers(SizeType32 maxBeamWidth, SizeType32 maxSeqLen, runtime::TllmRuntime const& runtime);
+    SlotDecoderBuffers(SizeType32 maxBeamWidth, SizeType32 maxSeqLen, runtime::BufferManager const& manager);
 
     static std::unique_ptr<DecoderSlotAsyncSend> asyncSend(std::shared_ptr<mpi::MpiComm> const& commSession,
         TensorPtr const& outputIdsView, TensorPtr const& sequenceLengthView, TensorPtr const& cumLogProbsView,
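These constructors and the create() helper previously took the full runtime::TllmRuntime, presumably only to reach its buffer manager; they now accept runtime::BufferManager const& directly, which narrows the dependency and makes the TllmRuntime forward declaration unnecessary.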

cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h (+4)

@@ -52,6 +52,8 @@ class BaseEvictionPolicy
         = 0;
     /// @brief Perform any per-iteration bookkeeping
     virtual void refresh() = 0;
+
+    virtual bool verifyQueueIntegrity() = 0;
 };
 
 struct ExpiringBlockComparator
@@ -87,6 +89,8 @@ class LRUEvictionPolicy : public BaseEvictionPolicy
     // Making this public and virtual makes it possible to test.
     [[nodiscard]] virtual std::chrono::steady_clock::time_point::duration getTime() const;
 
+    bool verifyQueueIntegrity() override;
+
 private:
     // Check if the block should be added to mFreeQueues.
     bool isReleasedLeafBlock(BlockPtr const& block);
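verifyQueueIntegrity() is added as a pure virtual on BaseEvictionPolicy and overridden by LRUEvictionPolicy; its implementation is not part of this header diff. As a rough illustration of what such a check can assert, here is a minimal sketch using a hypothetical free queue (a list of block ids plus a lookup set), not the actual TensorRT-LLM data structures:

#include <list>
#include <unordered_set>

// Hypothetical, simplified stand-in for an LRU eviction policy's free queue.
// The real classes track KV-cache blocks; plain ints are enough for the idea.
class ToyLruPolicy
{
public:
    void release(int blockId)
    {
        mFreeQueue.push_back(blockId);
        mFreeSet.insert(blockId);
    }

    // Integrity check in the spirit of verifyQueueIntegrity(): the queue and
    // the lookup set must describe exactly the same blocks, with no duplicates.
    bool verifyQueueIntegrity() const
    {
        std::unordered_set<int> seen;
        for (int blockId : mFreeQueue)
        {
            if (!seen.insert(blockId).second) // duplicate entry in the queue
                return false;
            if (mFreeSet.count(blockId) == 0) // queued block unknown to the policy
                return false;
        }
        return seen.size() == mFreeSet.size(); // no tracked block missing from the queue
    }

private:
    std::list<int> mFreeQueue;
    std::unordered_set<int> mFreeSet;
};

int main()
{
    ToyLruPolicy policy;
    policy.release(1);
    policy.release(2);
    return policy.verifyQueueIntegrity() ? 0 : 1;
}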
