lambda7xx · Jun 18, 2024
diff --git a/benchmarks/cpp/README.md
+1-1 b/benchmarks/cpp/README.md
+1-1
diff --git a/benchmarks/cpp/bertBenchmark.cpp
+2-2 b/benchmarks/cpp/bertBenchmark.cpp
+2-2
@@ -232,7 +232,7 @@ ${HOME}/.local/bin/trtllm-build \
     --output_dir ${LORA_ENGINE} \
     --max_batch_size ${MAX_BATCH} \
     --max_input_len $MAX_LEN \
-    --max_output_len $MAX_LEN \
+    --max_seq_len $((2*${MAX_LEN})) \
     --gemm_plugin float16 \
     --lora_plugin float16 \
     --use_paged_context_fmha enable \
 
@@ -17,6 +17,7 @@
 #include "tensorrt_llm/common/memoryUtils.h"
 #include "tensorrt_llm/plugins/api/tllmPlugin.h"
 #include "tensorrt_llm/runtime/iTensor.h"
+#include "tensorrt_llm/runtime/rawEngine.h"
 #include "tensorrt_llm/runtime/tllmLogger.h"
 #include "tensorrt_llm/runtime/tllmRuntime.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
@@ -78,11 +79,10 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da
 {
     auto const worldConfig = WorldConfig::mpi();
     auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);
-    auto engineBlob = loadEngine(enginePath.string());
 
     for (float gpuWeightsPercent : gpuWeightsPercents)
     {
-        auto rt = std::make_shared<TllmRuntime>(engineBlob.data(), engineBlob.size(), gpuWeightsPercent, *logger);
+        auto rt = std::make_shared<TllmRuntime>(RawEngine(enginePath), logger.get(), gpuWeightsPercent);
         rt->addContext(0);
         for (auto inLen : inLens)
         {
Original file line number	Diff line number	Diff line change
`@@ -17,6 +17,7 @@`
`17`	`17`	`#include "tensorrt_llm/common/memoryUtils.h"`
`18`	`18`	`#include "tensorrt_llm/plugins/api/tllmPlugin.h"`
`19`	`19`	`#include "tensorrt_llm/runtime/iTensor.h"`
	`20`	`+#include "tensorrt_llm/runtime/rawEngine.h"`
`20`	`21`	`#include "tensorrt_llm/runtime/tllmLogger.h"`
`21`	`22`	`#include "tensorrt_llm/runtime/tllmRuntime.h"`
`22`	`23`	`#include "tensorrt_llm/runtime/worldConfig.h"`
`@@ -78,11 +79,10 @@ void benchmarkBert(std::string const& modelName, std::filesystem::path const& da`
`78`	`79`	`{`
`79`	`80`	`auto const worldConfig = WorldConfig::mpi();`
`80`	`81`	`auto const enginePath = dataPath / engineFilename(dataPath, worldConfig, modelName);`
`81`		`- auto engineBlob = loadEngine(enginePath.string());`
`82`	`82`
`83`	`83`	`for (float gpuWeightsPercent : gpuWeightsPercents)`
`84`	`84`	`{`
`85`		`- auto rt = std::make_shared<TllmRuntime>(engineBlob.data(), engineBlob.size(), gpuWeightsPercent, *logger);`
	`85`	`+ auto rt = std::make_shared<TllmRuntime>(RawEngine(enginePath), logger.get(), gpuWeightsPercent);`
`86`	`86`	`rt->addContext(0);`
`87`	`87`	`for (auto inLen : inLens)`
`88`	`88`	`{`