NVIDIA
diff --git a/benchmarks/python/benchmark.py b/benchmarks/python/benchmark.py
+2-1
diff --git a/benchmarks/python/build.py b/benchmarks/python/build.py
+2
diff --git a/cpp/include/tensorrt_llm/runtime/decodingOutput.h b/cpp/include/tensorrt_llm/runtime/decodingOutput.h
+4-4
diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt
+1-1
diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a
+2-2
diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a
+2-2
diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib
+2-2
diff --git a/cpp/tensorrt_llm/common/cudaFp8Utils.cu b/cpp/tensorrt_llm/common/cudaFp8Utils.cu
+3
diff --git a/cpp/tensorrt_llm/common/cudaTypeUtils.cuh b/cpp/tensorrt_llm/common/cudaTypeUtils.cuh
+3
diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a
+2-2
diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a
+2-2
diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt
+3-3
diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a
+2-2
diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a
+2-2
diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib
+2-2
diff --git a/cpp/tensorrt_llm/executor_worker/executorWorker.cpp b/cpp/tensorrt_llm/executor_worker/executorWorker.cpp
+6
diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels.h b/cpp/tensorrt_llm/kernels/beamSearchKernels.h
+10-9
@@ -232,7 +232,8 @@ def parse_arguments():
         choices=[
             'fp8', 'fp8_gemm', 'fp8_kv_cache', 'int8_sq_per_tensor',
             'int8_sq_per_token_channel', 'int8_weight_only', 'int4_weight_only',
-            'int4_weight_only_awq', 'int4_weight_only_gptq'
+            'int4_weight_only_awq', 'int4_weight_only_gptq',
+            'int8_sq_per_channel_ootb'
         ],
         help="Optimize the model with specified quantization recipe")
     parser.add_argument(
 
@@ -220,6 +220,8 @@ def get_quant_config(quantization: str):
     elif quantization == "int8_sq_per_token_channel":
         return QuantConfig(
             quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)
+    elif quantization == "int8_sq_per_channel_ootb":
+        return QuantConfig(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL)
     elif quantization == "int8_weight_only":
         return QuantConfig(quant_algo=QuantAlgo.W8A16)
     elif quantization == "int4_weight_only":
 
@@ -35,14 +35,14 @@ class DecodingOutput
     class BeamHypotheses
     {
     public:
-        // The same as cpp/tensorrt_llm/kernels/beamSearchKernels.h
+        // Keep same as cpp/tensorrt_llm/kernels/beamSearchKernels.h
         TensorPtr outputIdsCBA;       // [BS, BM*2, MSL]
-        TensorPtr sequenceLengthsCBA; // [BS, BM]
+        TensorPtr logProbsCBA;        // [BS, BM*2, MSL]
+        TensorPtr sequenceLengthsCBA; // [BS, BM*2]
         TensorPtr cumLogProbsCBA;     // [BS, BM*2]
         TensorPtr normedScoresCBA;    // [BS, BM*2]
-        TensorPtr logProbsCBA;        // [BS, BM*2, MSL]
-        TensorPtr minNormedScoresCBA; // [BS]
         TensorPtr numBeamsCBA;        // [BS]
+        TensorPtr minNormedScoresCBA; // [BS]
         TensorPtr batchDones;         // [BS]
 
         void empty(BufferManager& manager);
 
@@ -1,3 +1,3 @@
 ed2ee8d73a5d374e800f653169bf293e libtensorrt_llm_batch_manager_static.a
 ed2ee8d73a5d374e800f653169bf293e libtensorrt_llm_batch_manager_static.pre_cxx11.a
-f088526f4bce4b1143c67973b3502734c3491ab9 commit
+05aaf1a0fb2f0115af107b00aa839a6601f6a873 commit
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c2cc6b820b9eb87d3417070b2996966e6147c28ba95e47cb97ae7c5d4375b8aa
-size 3210470
+oid sha256:00e2d6ee8efd00e27dd8da61be576ba7978d885a055d591c90f600b334356846
+size 3211414
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a0c7e0f8e717637d2686c44814640c25c71703bbdb50465955c35990e45a0399
-size 3185534
+oid sha256:b6b65183b0aa3f40f68aa13105da9dc00fb75b8bf8892813e46a09e3f0743570
+size 3186478
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5de50dfcea7e67aa4f8b1c5404f9902ceea909d798d0154a44800d3af46ce1b1
-size 19838492
+oid sha256:7d4a3bc5160666612e529f21c61dbd9d0f1b387662768f76b9351f877108f84b
+size 19840380
@@ -206,6 +206,9 @@ __device__ __nv_bfloat16 atomicMaxExtd(__nv_bfloat16* address, __nv_bfloat16 val
     }
 
     return __ushort_as_bfloat16(old);
+#else
+    asm volatile("  brkpt;\n");
+    return 0;
 #endif
 }
 
 
@@ -597,6 +597,9 @@ __device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val)
 {
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
     return __hmax(val.x, val.y);
+#else
+    asm volatile("  brkpt;\n");
+    return 0;
 #endif
 }
 #endif
 
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8d54459f1db7a6b78b67f2a7f378bf4fab4a24fdf70e08f5b1e2bcbf7ffd538d
-size 1251758
+oid sha256:8ff03e99e17e64c9f559e4586dec3983d438857c0050a34a417e4d86d56fbe2a
+size 1251854
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8d54459f1db7a6b78b67f2a7f378bf4fab4a24fdf70e08f5b1e2bcbf7ffd538d
-size 1251758
+oid sha256:8ff03e99e17e64c9f559e4586dec3983d438857c0050a34a417e4d86d56fbe2a
+size 1251854
@@ -1,3 +1,3 @@
-3c423e837e67ce86756ea468438c41a8 libtensorrt_llm_executor_static.a
-3c423e837e67ce86756ea468438c41a8 libtensorrt_llm_executor_static.pre_cxx11.a
-f088526f4bce4b1143c67973b3502734c3491ab9 commit
+54670adde093baff8b031869bdeeeb1b libtensorrt_llm_executor_static.a
+54670adde093baff8b031869bdeeeb1b libtensorrt_llm_executor_static.pre_cxx11.a
+05aaf1a0fb2f0115af107b00aa839a6601f6a873 commit
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ec73640a8f4a20baf86ec85f02ff673edf209f77d2b57b7c5b9e11a8abcc3dd5
-size 1269974
+oid sha256:431dc6352dcb332821aab031ccbd887e6a60591a5ea276a9ffd3df1f28463326
+size 1271014
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b5315703c0e4c6f0154ac60b3700187dbf787d50fdfc1cdf33b9806814a39846
-size 1226290
+oid sha256:86319542d275570a0c66622d4656b88f3d153c6861db0e53f17f29d47e0a30c9
+size 1227362
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0aec01d232889a57fc99133e28a39c82ae47cfcc58894a938d851d9b2c3e64d5
-size 12074178
+oid sha256:8ed99448579b40e0046eca5c8989151a66579f8fbccef9cda4ee7fc2ffd2245b
+size 12076106
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/mpiUtils.h"
 #include "tensorrt_llm/executor/executor.h"
@@ -46,6 +47,11 @@ int main(int argc, char* argv[])
         return -1;
     }
 
+    // TRT-LLM event synchronization sometimes takes extra time to complete
+    // after the kernel has finished. Using a yield in the wait helps improve
+    // performance.
+    TLLM_CUDA_CHECK(::cudaSetDeviceFlags(cudaDeviceScheduleYield));
+
     // Since parentComm is an intercommunicator, input root
     // is the rank of the parent process in his group
     // (always 0 as the parent size is checked before)
 
@@ -29,18 +29,18 @@ struct BeamHypotheses
 {
     // clang-format off
 
-    // BS: batch_size, BM: beam_width, MSL: max_seq_length
-    // %%: parameter name when dynamic_decoder.forward() / gather_tree() are called in [generation.py] (python workflow)
+    // MBS: max_batch_size, BS: batch_size, BM: beam_width, MSL: max_seq_length
+    // %%: parameter name in file generation.py (python workflow)
 
     // Candidate beams: a beam which generates end_id or its sequence length reaches MSL
-    // Candidate-Beam-Array (CBA): The arrays (size: BM*2) to place the candidate beams and related information
+    // Candidate-Beam-Array (CBA): The arrays to place the candidate beams and related information
 
     // Scalar values
     bool bReturnNormedScore{false};     // return normed_score / cum_log_probs, useless yet
-    int nBatchSize{0};                  //
+    int nMaxBatchSize{0};               // max batch size by model configuration
+    int nBatchSize{0};                  // batch size by runtime input data
     int nBeamWidth{0};                  //
     int nIte{0};                        // index of local_batch, always be 0 when pp_size==1
-    int nBatchSizeLocal{0};             //
     int nMaxSeqLen{0};                  //
     int nVocabSize{0};                  // vocab_size_padded
 
@@ -54,8 +54,9 @@ struct BeamHypotheses
     int const* endIds{nullptr};         // [BS, BM]         %% self.end_ids
 
     // Pointers for output
-    int* outputIds{nullptr};            // [BS, BM, MSL]    %% self.output_ids
-    float* logProbs{nullptr};           // [MSL, BS, BM]    %% self.log_probs_tiled
+    int* outputIds{nullptr};            // [BS, BM, MSL]    %% self.output_ids                      only used in gather_tree
+    float* logProbs{nullptr};           // [BS, BM, MSL]    %% self.log_probs                       only used in gather_tree
+    float* logProbsTiled{nullptr};      // [MSL, MBS, BM]   %% self.log_probs_tiled
     int* sequenceLengths{nullptr};      // [BS, BM]         %% self.sequence_length_buffer
     float* cumLogProbs{nullptr};        // [BS, BM]         %% self.cum_log_probs
 
@@ -65,8 +66,8 @@ struct BeamHypotheses
     int* sequenceLengthsCBA{nullptr};   // [BS, BM*2]       %% self.beam_hyps_seq_len_cba
     float* cumLogProbsCBA{nullptr};     // [BS, BM*2]       %% self.beam_hyps_cum_log_probs_cba
     float* normedScoresCBA{nullptr};    // [BS, BM*2]       %% self.beam_hyps_normed_scores_cba
-    int* numBeamsCBA{nullptr};          // [BS]             %% self.beam_hyps_num_beams           number of beams in CBA
-    float* minNormedScoresCBA{nullptr}; // [BS]             %% self.beam_hyps_min_normed_scores   worst score in CBA
+    int* numBeamsCBA{nullptr};          // [BS]             %% self.beam_hyps_num_beams             number of beams in CBA
+    float* minNormedScoresCBA{nullptr}; // [BS]             %% self.beam_hyps_min_normed_scores     worst score in CBA
 
     // Pointers related to beam search process, they are initialized in those two functions:
     // [gptDecoder.cpp] GptDecoder<T>::forward or [dynamicDecodeOp.cpp] FtDynamicDecode<T>::forward
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -206,6 +206,9 @@ __device__ __nv_bfloat16 atomicMaxExtd(__nv_bfloat16* address, __nv_bfloat16 val` |
| `206` | `206` | `    }` |
| `207` | `207` | |
| `208` | `208` | `    return __ushort_as_bfloat16(old);` |
| | `209` | `+#else` |
| | `210` | `+    asm volatile("  brkpt;\n");` |
| | `211` | `+    return 0;` |
| `209` | `212` | `#endif` |
| `210` | `213` | `}` |
| `211` | `214` | |
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -597,6 +597,9 @@ __device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val)` |
| `597` | `597` | `{` |
| `598` | `598` | `#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))` |
| `599` | `599` | `    return __hmax(val.x, val.y);` |
| | `600` | `+#else` |
| | `601` | `+    asm volatile("  brkpt;\n");` |
| | `602` | `+    return 0;` |
| `600` | `603` | `#endif` |
| `601` | `604` | `}` |
| `602` | `605` | `#endif` |