Skip to content

Commit

Permalink
Make profiler better (#38280)
Browse files Browse the repository at this point in the history
* add align for WorkQueue

* add spinlock

* merge develop

* merge

* Add EventsWaiter

* Revert "Add EventsWaiter"

This reverts commit e206173.

* update OS info

* split host_event_recorder

* split host_event_recorder

* update

* update

* update

* update

* update

* update

* update

Co-authored-by: liutiexing <liutiexing@google.com>
  • Loading branch information
liutiexing and liutiexing authored Dec 29, 2021
1 parent 14658d8 commit 851637f
Show file tree
Hide file tree
Showing 11 changed files with 462 additions and 349 deletions.
9 changes: 5 additions & 4 deletions paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ IF(WITH_XBYAK)
ENDIF()
cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
cc_library(os_info SRCS os_info.cc DEPS enforce device_tracer)
cc_library(os_info SRCS os_info.cc DEPS enforce)

IF(WITH_GPU)
nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph)
Expand Down Expand Up @@ -169,15 +169,16 @@ cc_test(timer_test SRCS timer_test.cc DEPS timer)
cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto)
cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer)

cc_library(host_event_recorder SRCS host_event_recorder.cc DEPS os_info)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS host_event_recorder os_info device_tracer gpu_info enforce dynload_cuda)
nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
elseif(WITH_ROCM)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce)
hip_library(profiler SRCS profiler.cc profiler.cu DEPS host_event_recorder os_info device_tracer gpu_info enforce)
hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
else()
cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce)
cc_library(profiler SRCS profiler.cc DEPS host_event_recorder os_info device_tracer enforce)
cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
endif()

Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/platform/device_tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ class DeviceTracerImpl : public DeviceTracer {
}

void AddCPURecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, int64_t thread_id) {
uint64_t end_ns, int64_t device_id, uint64_t thread_id) {
if (anno.empty()) {
VLOG(1) << "Empty timeline annotation.";
return;
Expand Down Expand Up @@ -383,7 +383,7 @@ class DeviceTracerImpl : public DeviceTracer {

void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &alloc_in,
const std::string &free_in, int64_t thread_id) {
const std::string &free_in, uint64_t thread_id) {
if (0 == start_ns || 0 == end_ns) {
VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
return;
Expand Down
8 changes: 4 additions & 4 deletions paddle/fluid/platform/device_tracer.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class DeviceTracer {
uint64_t start_ns;
uint64_t end_ns;
int64_t device_id;
int64_t thread_id;
uint64_t thread_id;
};

struct MemRecord {
Expand All @@ -68,7 +68,7 @@ class DeviceTracer {
uint64_t end_ns;
size_t bytes;
Place place;
int64_t thread_id;
uint64_t thread_id;
std::string alloc_in;
std::string free_in;
};
Expand Down Expand Up @@ -105,7 +105,7 @@ class DeviceTracer {

virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
int64_t thread_id) = 0;
uint64_t thread_id) = 0;
virtual void AddActiveKindRecords(const std::string& anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
uint64_t thread_id,
Expand All @@ -115,7 +115,7 @@ class DeviceTracer {
size_t bytes, const Place& place,
const std::string& alloc_in,
const std::string& free_in,
int64_t thread_id) = 0;
uint64_t thread_id) = 0;

// Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability.
Expand Down
57 changes: 46 additions & 11 deletions paddle/fluid/platform/event.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ limitations under the License. */

#pragma once

#include <functional>
#include <map>
#include <string>
#include <utility>
Expand Down Expand Up @@ -45,9 +46,9 @@ class Event {
Event(EventType type, std::string name, uint32_t thread_id,
EventRole role = EventRole::kOrdinary, std::string attr = "none");

const EventType& type() const;
Event* parent() const { return parent_; }
void set_parent(Event* parent) { parent_ = parent; }
const EventType &type() const;
Event *parent() const { return parent_; }
void set_parent(Event *parent) { parent_ = parent; }
std::string name() const { return name_; }
EventRole role() const { return role_; }
uint64_t thread_id() const { return thread_id_; }
Expand All @@ -61,13 +62,13 @@ class Event {
#endif
#endif

double CpuElapsedMs(const Event& e) const;
double CudaElapsedMs(const Event& e) const;
double CpuElapsedMs(const Event &e) const;
double CudaElapsedMs(const Event &e) const;

private:
EventType type_;
std::string name_{};
Event* parent_{nullptr};
Event *parent_{nullptr};
uint64_t thread_id_;
EventRole role_{};
int64_t cpu_ns_;
Expand All @@ -90,13 +91,13 @@ class Event {
#endif
};

using EventWithStartNs = std::pair<Event*, uint64_t>;
using EventWithStartNs = std::pair<Event *, uint64_t>;
using ThreadEvents = std::map<uint64_t, EventWithStartNs>;

class MemEvent {
public:
MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
Place place, int64_t thread_id, const std::string& annotation)
Place place, int64_t thread_id, const std::string &annotation)
: type_(type),
start_ns_(start_ns),
end_ns_(end_ns),
Expand All @@ -105,13 +106,13 @@ class MemEvent {
thread_id_(thread_id),
annotation_(annotation) {}

const EventType& type() const { return type_; }
const EventType &type() const { return type_; }
uint64_t start_ns() const { return start_ns_; }
uint64_t end_ns() const { return end_ns_; }
size_t bytes() const { return bytes_; }
Place place() const { return place_; }
uint64_t thread_id() const { return thread_id_; }
const std::string& annotation() const { return annotation_; }
const std::string &annotation() const { return annotation_; }

private:
EventType type_;
Expand Down Expand Up @@ -151,7 +152,7 @@ class CudaEvent {
#endif
}

void Record(const paddle::platform::stream::CUDAStream& stream) {
void Record(const paddle::platform::stream::CUDAStream &stream) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream.raw_stream()));
#else
Expand Down Expand Up @@ -200,5 +201,39 @@ class CudaEvent {
#endif
};

// Plain record describing one host-side event: a name, a start/end timestamp
// pair (start_ns / end_ns), a role and an optional attribute string.
//
// The struct never frees `name` or `attr`. Depending on the constructor used
// they either alias a caller-provided C string or point into memory handed
// out by a caller-provided allocator; in both cases the caller is responsible
// for keeping that memory alive while the event is in use.
struct CommonEvent {
 public:
  // Stores `name` as-is (pointer copy only, no string copy). `attr` stays
  // nullptr.
  CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns,
              EventRole role)
      : name(name), start_ns(start_ns), end_ns(end_ns), role(role) {}

  // Copies `name_str` and `attr_str` into two buffers requested from
  // `arena_allocator` (name first, then attr) and points `name` / `attr` at
  // those copies.
  //
  // arena_allocator: called with the number of bytes needed; must return a
  //                  writable buffer of at least that size. The result is not
  //                  checked for nullptr.
  // The allocator is taken by const reference so that lambdas and other
  // temporaries can be passed, matching the overload without `attr_str`.
  CommonEvent(const std::function<void *(size_t)> &arena_allocator,
              const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
              EventRole role, const std::string &attr_str)
      : name(CopyToArena(arena_allocator, name_str)),
        start_ns(start_ns),
        end_ns(end_ns),
        role(role),
        attr(CopyToArena(arena_allocator, attr_str)) {}

  // Same as the constructor above but without an attribute string; `attr`
  // stays nullptr.
  CommonEvent(const std::function<void *(size_t)> &arena_allocator,
              const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
              EventRole role)
      : name(CopyToArena(arena_allocator, name_str)),
        start_ns(start_ns),
        end_ns(end_ns),
        role(role) {}

  const char *name = nullptr;  // not owned, designed for performance
  uint64_t start_ns = 0;
  uint64_t end_ns = 0;
  EventRole role = EventRole::kOrdinary;
  const char *attr = nullptr;  // not owned, designed for performance

 private:
  // Requests str.length() + 1 bytes from `arena_allocator`, copies the
  // characters of `str` into them, appends the terminating '\0' and returns
  // the buffer.
  static const char *CopyToArena(
      const std::function<void *(size_t)> &arena_allocator,
      const std::string &str) {
    const size_t len = str.length();
    auto *buf = static_cast<char *>(arena_allocator(len + 1));
    // std::string::copy does not write a terminator, so add it explicitly.
    str.copy(buf, len);
    buf[len] = '\0';
    return buf;
  }
};

} // namespace platform
} // namespace paddle
70 changes: 70 additions & 0 deletions paddle/fluid/platform/event_tracing.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <string>
#include "paddle/fluid/platform/event.h"

namespace paddle {
namespace platform {

// CPU event tracing. A trace marks something that happens but has no duration
// associated with it. For example, thread starts working.
// Chrome Trace Viewer Format: Instant Event
//
// The struct declares no data members; its only member is the constructor
// below, which is declared here without a body.
struct RecordInstantEvent {
// name: label of the instant event.
// role: role of the event; defaults to EventRole::kOrdinary.
// NOTE(review): presumably the event is recorded as a side effect of
// constructing the object - confirm against the constructor's definition.
explicit RecordInstantEvent(const char* name,
const EventRole role = EventRole::kOrdinary);
};

// CPU event tracing. A trace starts when an object of this class is created
// and stops when the object is destroyed.
// Chrome Trace Viewer Format: Duration Event/Complete Event
//
// NOTE(review): the destructor calls End() and the class is implicitly
// copyable; confirm that instances are never copied, or that End() is safe to
// run for both the original and its copy.
class RecordEvent {
public:
// Starts tracing an event called `name`. `role` defaults to
// EventRole::kOrdinary.
explicit RecordEvent(const std::string& name,
const EventRole role = EventRole::kOrdinary);

// Overload taking the event name as a C string.
// NOTE(review): presumably the pointer is kept in shallow_copy_name_ without
// copying the characters, in which case `name` must outlive this object -
// confirm against the definition.
explicit RecordEvent(const char* name,
const EventRole role = EventRole::kOrdinary);

// Overload that additionally takes an attribute string `attr`. Here `role`
// has no default and must be given explicitly.
RecordEvent(const std::string& name, const EventRole role,
const std::string& attr);

// Stop event tracing explicitly before the object goes out of scope.
// Sometimes it's inconvenient to use RAII
void End();

// Ends the trace when the object is destroyed.
~RecordEvent() { End(); }

private:
// Helper declared for the constructors; defined outside this header.
void OriginalConstruct(const std::string& name, const EventRole role,
const std::string& attr);

bool is_enabled_{false};
bool is_pushed_{false};
// Event name
// NOTE(review): name_ and attr_ are raw pointers; whether this class owns and
// deletes them is not visible here - confirm in End().
std::string* name_{nullptr};
const char* shallow_copy_name_{nullptr};
// No default initializer: holds an indeterminate value until a constructor
// assigns it.
uint64_t start_ns_;
// Need to distinguish name by op type, block_id, program_id and perhaps
// different kernel invocations within an op.
// std::string full_name_;
EventRole role_{EventRole::kOrdinary};
std::string* attr_{nullptr};
bool finished_{false};
};

} // namespace platform
} // namespace paddle
33 changes: 33 additions & 0 deletions paddle/fluid/platform/host_event_recorder.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/host_event_recorder.h"
#include "paddle/fluid/platform/os_info.h"

namespace paddle {
namespace platform {

// Stores the id that ThreadIdRegistry reports for the current thread and then
// registers this recorder with the HostEventRecorder instance under that id.
ThreadEventRecorder::ThreadEventRecorder() {
  // The id has to be stored first because it is passed to the registration
  // call below.
  auto &&id_registry = ThreadIdRegistry::GetInstance();
  thread_id_ = id_registry.CurrentThreadId().MainTid();

  auto &&host_recorder = HostEventRecorder::GetInstance();
  host_recorder.RegisterThreadRecorder(thread_id_, this);
}

// Gathers the events of every entry in `thread_recorders_` into a single
// HostEventSection.
//
// Returns a section whose `thr_sections` holds one element per registered
// recorder, appended in the iteration order of `thread_recorders_`.
HostEventSection HostEventRecorder::GatherEvents() {
  HostEventSection host_sec;
  // One section per recorder, so the final size is known up front.
  host_sec.thr_sections.reserve(thread_recorders_.size());
  for (auto &kv : thread_recorders_) {
    // NOTE(review): std::move is kept because the return type of the
    // per-thread GatherEvents() is not visible here; it is redundant if that
    // function returns by value.
    host_sec.thr_sections.emplace_back(std::move(kv.second->GatherEvents()));
  }
  // Return the local by name: wrapping it in std::move would disable copy
  // elision (NRVO) and force a move construction instead.
  return host_sec;
}

} // namespace platform
} // namespace paddle
Loading

0 comments on commit 851637f

Please sign in to comment.