Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adapt host event recorder to profiler #37766

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
8744ba7
add align for WorkQueue
liutiexing Sep 22, 2021
4759bc8
Merge branch 'develop' of https://github.com/liutiexing/Paddle into d…
liutiexing Sep 22, 2021
6f00ace
add spinlock
liutiexing Sep 23, 2021
2d6f1cf
merge develop
liutiexing Sep 23, 2021
f5099be
Merge branch 'develop' of https://github.com/liutiexing/Paddle into d…
liutiexing Sep 26, 2021
54aa332
merge develop
liutiexing Oct 12, 2021
1d1bd82
merge
liutiexing Oct 12, 2021
dfbf3e4
Merge remote-tracking branch 'upstream/develop' into develop
liutiexing Oct 12, 2021
a5392b3
Merge remote-tracking branch 'upstream/develop' into develop
liutiexing Oct 14, 2021
e206173
Add EventsWaiter
liutiexing Oct 15, 2021
0a3dcd9
Revert "Add EventsWaiter"
liutiexing Oct 15, 2021
4689bb5
Merge remote-tracking branch 'upstream/develop' into develop
liutiexing Oct 15, 2021
0cec99a
Merge remote-tracking branch 'upstream/develop' into develop
liutiexing Oct 20, 2021
481c4fa
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Oct 27, 2021
83db84e
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Oct 29, 2021
7010e0d
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Nov 16, 2021
ec2a363
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Nov 23, 2021
90a59ec
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Nov 26, 2021
1445bbe
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Nov 29, 2021
a2c74ab
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Dec 1, 2021
078b1c3
add os_info
Dec 1, 2021
46b45ab
update
Dec 1, 2021
a8eaeeb
Merge branch 'PaddlePaddle:develop' into AdaptHostEventRecorderToProf…
liutiexing Dec 1, 2021
d8b3459
update
Dec 1, 2021
5230911
Merge branch 'AdaptHostEventRecorderToProfiler' of https://github.com…
Dec 1, 2021
1c09b4e
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Dec 2, 2021
dce77d4
Merge branch 'PaddlePaddle:develop' into AdaptHostEventRecorderToProf…
liutiexing Dec 2, 2021
c3f271f
update
Dec 2, 2021
58ddce7
update
Dec 2, 2021
14400ae
update
Dec 2, 2021
cb8cf7d
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Dec 8, 2021
fcf9b4f
merge
Dec 8, 2021
96a5f45
update for bugfix
Dec 8, 2021
cf0dcd6
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Dec 8, 2021
2112039
merge develop
Dec 8, 2021
2f95801
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Dec 14, 2021
9ef1694
Merge branch 'develop' into AdaptHostEventRecorderToProfiler
Dec 14, 2021
99b257f
update
Dec 14, 2021
aaf83cc
update
Dec 14, 2021
14bec1b
Merge branch 'PaddlePaddle:develop' into develop
liutiexing Dec 15, 2021
726201a
Merge remote-tracking branch 'origin/develop' into AdaptHostEventReco…
Dec 15, 2021
1341f66
update
Dec 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ IF(WITH_XBYAK)
ENDIF()
cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
cc_library(os_info SRCS os_info.cc DEPS enforce device_tracer)

IF(WITH_GPU)
nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph)
Expand Down Expand Up @@ -165,13 +166,13 @@ cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_pri

cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce dynload_cuda)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
elseif(WITH_ROCM)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce)
hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else()
cc_library(profiler SRCS profiler.cc DEPS device_tracer enforce)
cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce)
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif()

Expand Down
70 changes: 62 additions & 8 deletions paddle/fluid/platform/device_tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/device_tracer.h"

DECLARE_bool(enable_host_event_recorder_hook);

namespace paddle {
namespace platform {

// Used only by DeviceTracer
uint64_t GetThreadIdFromSystemThreadId(uint32_t id);

namespace {
// Tracking the nested block stacks of each thread.
#ifdef PADDLE_WITH_SW
Expand All @@ -40,7 +46,8 @@ thread_local std::deque<Event *> annotation_stack;
static std::deque<Event *> main_thread_annotation_stack{};
static std::deque<std::string> main_thread_annotation_stack_name{};

std::map<uint32_t, int32_t> system_thread_id_map;
std::map<uint32_t, uint64_t> system_thread_id_map;
std::mutex system_thread_id_map_mutex;

std::once_flag tracer_once_flag;
DeviceTracer *tracer = nullptr;
Expand Down Expand Up @@ -299,6 +306,47 @@ class DeviceTracerImpl : public DeviceTracer {
local_correlations_pairs->push_front(std::make_pair(id, event));
}

// Correlate device-side "active kind" records with the host Event that was
// active on the same thread over the same time span, registering each match
// as a correlation_id -> Event pair via AddAnnotation.
// `thr_events` maps a thread id to that thread's event timeline; the
// upper_bound(r.end_ns) search below implies ThreadEvents is keyed by an
// end timestamp -- TODO confirm against the ThreadEvents producer.
// Records with no matching thread or event are skipped with a VLOG trace.
void AddAnnotations(const std::map<uint64_t, ThreadEvents> &thr_events) {
for (auto &tmp : active_kind_records_) {
for (const ActiveKindRecord &r : tmp) {
// No host events were recorded for this record's thread.
auto iter = thr_events.find(r.thread_id);
if (iter == thr_events.end()) {
VLOG(10) << __func__ << " " << r.name
<< " Missing tid: " << r.thread_id;
continue;
}
const ThreadEvents &evts = iter->second;
// First event whose key is strictly greater than the record's end.
auto evt_iter = evts.upper_bound(r.end_ns);
if (evt_iter == evts.end()) {
VLOG(10) << __func__ << " Missing Record " << r.name
<< " tid: " << r.thread_id << " end_ns: " << r.end_ns;
continue;
}
// Prefer the previous event if its key already covers r.end_ns;
// otherwise keep the upper_bound result and log the gap.
if (evt_iter != evts.begin()) {
auto prev_iter = std::prev(evt_iter);
if (prev_iter->first >= r.end_ns) {
evt_iter = prev_iter;
} else {
VLOG(10) << __func__ << " prev end_ns " << prev_iter->first
<< " end_ns: " << r.end_ns;
}
}
Event *evt = evt_iter->second.first;
uint64_t start_ns = evt_iter->second.second;
// The candidate event must have started no later than the record;
// otherwise it cannot be the annotation that produced this activity.
if (start_ns > r.start_ns) {
VLOG(10) << __func__ << " Mismatch Record " << r.name
<< " tid: " << r.thread_id << " start_ns: " << r.start_ns
<< " end_ns: " << r.end_ns << ", event " << evt->name()
<< " start_ns: " << start_ns;
continue;
}
VLOG(10) << __func__ << " tid: " << r.thread_id << " Add correlation "
<< r.correlation_id << "<->" << evt->name();
AddAnnotation(r.correlation_id, evt);
}
}
}

void AddCPURecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t thread_id) {
if (anno.empty()) {
Expand Down Expand Up @@ -357,7 +405,7 @@ class DeviceTracerImpl : public DeviceTracer {

void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id, uint32_t correlation_id) {
uint64_t thread_id, uint32_t correlation_id) {
if (anno.empty()) {
VLOG(1) << "Empty timeline annotation.";
return;
Expand Down Expand Up @@ -524,7 +572,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_detail_info(c->second->attr());
find++;
} else {
VLOG(10) << "Missing Kernel Event: " + r.name;
VLOG(10) << __func__ << " Missing Kernel Event: " + r.name;
miss++;
event->set_name(r.name);
}
Expand All @@ -533,7 +581,8 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id);
}
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
VLOG(1) << __func__ << " KernelRecord event miss: " << miss
<< " find: " << find;

for (auto &tmp : cpu_records_) {
for (const CPURecord &r : tmp) {
Expand Down Expand Up @@ -583,7 +632,8 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes);
}
VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
VLOG(1) << __func__ << " MemRecord event miss: " << miss
<< " find: " << find;

for (auto &tmp : mem_info_record_) {
for (const auto &r : tmp) {
Expand Down Expand Up @@ -633,6 +683,9 @@ class DeviceTracerImpl : public DeviceTracer {
#ifdef PADDLE_WITH_CUPTI
static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const void *cbdata) {
if (LIKELY(FLAGS_enable_host_event_recorder_hook)) {
return;
}
auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
Expand Down Expand Up @@ -712,6 +765,7 @@ Event *CurAnnotation() {
if (annotation_stack.empty()) return nullptr;
return annotation_stack.back();
}

std::string CurAnnotationName() {
if (annotation_stack.empty()) return "Unknown";
return annotation_stack.back()->name();
Expand All @@ -730,13 +784,13 @@ uint32_t GetCurSystemThreadId() {
return id;
}

void RecoreCurThreadId(int32_t id) {
// Record a mapping from the calling thread's system-level id to the
// caller-supplied id, so GetThreadIdFromSystemThreadId can translate
// between the two numbering schemes later.
void RecoreCurThreadId(uint64_t id) {
  std::lock_guard<std::mutex> guard(system_thread_id_map_mutex);
  const uint32_t sys_tid = GetCurSystemThreadId();
  VLOG(1) << "RecoreCurThreadId: " << sys_tid << " -> " << id;
  system_thread_id_map[sys_tid] = id;
}

int32_t GetThreadIdFromSystemThreadId(uint32_t id) {
uint64_t GetThreadIdFromSystemThreadId(uint32_t id) {
auto it = system_thread_id_map.find(id);
if (it != system_thread_id_map.end()) return it->second;
// return origin id if no event is recorded in this thread.
Expand Down
18 changes: 7 additions & 11 deletions paddle/fluid/platform/device_tracer.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ limitations under the License. */

#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h"

namespace paddle {
Expand All @@ -30,12 +30,6 @@ namespace platform {
//////////////////////
class Event;

inline uint64_t PosixInNsec() {
struct timeval tv;
gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}

// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
Expand Down Expand Up @@ -84,7 +78,7 @@ class DeviceTracer {
uint64_t start_ns;
uint64_t end_ns;
int64_t device_id;
int64_t thread_id;
uint64_t thread_id;
uint32_t correlation_id;
};

Expand All @@ -101,6 +95,9 @@ class DeviceTracer {
// human-readable annotations.
virtual void AddAnnotation(uint32_t id, Event* event) = 0;

virtual void AddAnnotations(
const std::map<uint64_t, ThreadEvents>& thr_events) = 0;

virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t stream_id, uint32_t correlation_id,
Expand All @@ -111,7 +108,7 @@ class DeviceTracer {
int64_t thread_id) = 0;
virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id,
uint64_t thread_id,
uint32_t correlation_id) = 0;

virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns,
Expand Down Expand Up @@ -154,7 +151,6 @@ void ClearCurBlock();
int BlockDepth();

// Set current thread id, so we can map the system thread id to thread id.
void RecoreCurThreadId(int32_t id);
int32_t GetThreadIdFromSystemThreadId(uint32_t id);
void RecoreCurThreadId(uint64_t id);
} // namespace platform
} // namespace paddle
13 changes: 9 additions & 4 deletions paddle/fluid/platform/event.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@ limitations under the License. */

#pragma once

#include <map>
#include <string>
#include <utility>
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
Expand Down Expand Up @@ -48,7 +50,7 @@ class Event {
void set_parent(Event* parent) { parent_ = parent; }
std::string name() const { return name_; }
EventRole role() const { return role_; }
uint32_t thread_id() const { return thread_id_; }
uint64_t thread_id() const { return thread_id_; }
void set_name(std::string name) { name_ = name; }
void set_role(EventRole role) { role_ = role; }
std::string attr() const { return attr_; }
Expand All @@ -66,7 +68,7 @@ class Event {
EventType type_;
std::string name_{};
Event* parent_{nullptr};
uint32_t thread_id_;
uint64_t thread_id_;
EventRole role_{};
int64_t cpu_ns_;
bool visited_status_{false};
Expand All @@ -88,6 +90,9 @@ class Event {
#endif
};

using EventWithStartNs = std::pair<Event*, uint64_t>;
using ThreadEvents = std::map<uint64_t, EventWithStartNs>;

class MemEvent {
public:
MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
Expand All @@ -105,7 +110,7 @@ class MemEvent {
uint64_t end_ns() const { return end_ns_; }
size_t bytes() const { return bytes_; }
Place place() const { return place_; }
int64_t thread_id() const { return thread_id_; }
uint64_t thread_id() const { return thread_id_; }
const std::string& annotation() const { return annotation_; }

private:
Expand All @@ -114,7 +119,7 @@ class MemEvent {
uint64_t end_ns_ = 0;
size_t bytes_;
Place place_;
int64_t thread_id_;
uint64_t thread_id_;
std::string annotation_;
};

Expand Down
38 changes: 38 additions & 0 deletions paddle/fluid/platform/os_info.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/os_info.h"
#include <sstream>
#include "paddle/fluid/platform/device_tracer.h"

namespace paddle {
namespace platform {

// Capture the identifiers of the calling thread.
// std_tid_ is a hash of std::thread::id; cupti_tid_ is presumably the id
// CUPTI reports -- derived here by parsing the streamed numeric form of
// std::thread::id, which assumes the platform prints it as a number
// (TODO confirm on non-glibc targets).
ThreadId::ThreadId() {
  const auto this_id = std::this_thread::get_id();
  std_tid_ = std::hash<std::thread::id>()(this_id);
  std::ostringstream oss;
  oss << this_id;
  cupti_tid_ = static_cast<uint32_t>(std::stoull(oss.str()));
  RecoreCurThreadId(MainTid());  // For DeviceTracer
}

// Tear down the registry, freeing every ThreadId it owns.
// NOTE(review): assumes all other accessors of id_map_ also take lock_;
// verify against the rest of ThreadIdRegistry.
ThreadIdRegistry::~ThreadIdRegistry() {
  std::lock_guard<std::mutex> lock(lock_);
  // Iterate by const reference: the original `auto id_pair` copied each
  // map entry per iteration (clang-tidy performance-for-range-copy).
  for (const auto &id_pair : id_map_) {
    delete id_pair.second;
  }
}

} // namespace platform
} // namespace paddle
Loading