Skip to content

Commit c2cb520

Browse files
[XLA:GPU] Add debug messages to track which thunk hangs.
PiperOrigin-RevId: 736468039
1 parent 49c16af commit c2cb520

File tree

3 files changed

+14
-2
lines changed

3 files changed

+14
-2
lines changed

xla/backends/gpu/runtime/BUILD

+2 -1
Original file line number | Diff line number | Diff line change
@@ -1100,10 +1100,11 @@ cc_library(
11001100
deps = [
11011101
":annotation",
11021102
":thunk",
1103+
"//xla/tsl/platform:errors",
11031104
"@com_google_absl//absl/functional:function_ref",
1105+
"@com_google_absl//absl/log",
11041106
"@com_google_absl//absl/status",
11051107
"@com_google_absl//absl/strings",
1106-
"@tsl//tsl/platform:errors",
11071108
"@tsl//tsl/profiler/lib:scoped_annotation",
11081109
],
11091110
)

xla/backends/gpu/runtime/sequential_thunk.cc

+8 -1
Original file line number | Diff line number | Diff line change
@@ -21,11 +21,12 @@ limitations under the License.
2121
#include <utility>
2222

2323
#include "absl/functional/function_ref.h"
24+
#include "absl/log/log.h"
2425
#include "absl/status/status.h"
2526
#include "absl/strings/str_cat.h"
2627
#include "xla/backends/gpu/runtime/annotation.h"
2728
#include "xla/backends/gpu/runtime/thunk.h"
28-
#include "tsl/platform/errors.h"
29+
#include "xla/tsl/platform/errors.h"
2930
#include "tsl/profiler/lib/scoped_annotation.h"
3031

3132
namespace xla {
@@ -83,7 +84,13 @@ absl::Status SequentialThunk::ExecuteOnStream(const ExecuteParams& params) {
8384
if (params.mock_collectives && thunk->IsCollective()) {
8485
continue;
8586
}
87+
VLOG(1) << "[" << params.stream->parent()->device_ordinal() << "] "
88+
<< "Start SequentialThunk::ExecuteOnStream: "
89+
<< thunk->profile_annotation();
8690
TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(params));
91+
VLOG(1) << "[" << params.stream->parent()->device_ordinal() << "] "
92+
<< "End SequentialThunk::ExecuteOnStream: "
93+
<< thunk->profile_annotation();
8794
}
8895
return absl::OkStatus();
8996
}

xla/service/gpu/gpu_executable.cc

+4
Original file line number | Diff line number | Diff line change
@@ -345,7 +345,11 @@ absl::Status ExecuteThunksImpl(
345345
command_buffer_trace_stream, &collective_params, &collective_cliques,
346346
std::move(additional_execution_streams));
347347

348+
VLOG(1) << "[" << run_options->device_ordinal() << "] "
349+
<< "Start GpuExecutable::ExecuteOnStream module: " << module_name;
348350
TF_RETURN_IF_ERROR(thunk_sequence.ExecuteOnStream(execute_params));
351+
VLOG(1) << "[" << run_options->device_ordinal() << "] "
352+
<< "End GpuExecutable::ExecuteOnStream module: " << module_name;
349353

350354
return MaybeSyncAndProfile(run_options, execution_timer.get(),
351355
block_host_until_done ? main_stream : nullptr);

0 commit comments

Comments
 (0)