From 515be94e8298f0db907a4c3d5788ba3da724dc32 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Wed, 17 Nov 2021 15:37:28 +0800 Subject: [PATCH 01/13] Support multi-stream allocation for CUDA place --- paddle/fluid/memory/allocation/CMakeLists.txt | 3 +- paddle/fluid/memory/allocation/allocator.h | 9 + .../memory/allocation/allocator_facade.cc | 558 ++++++++++++------ .../memory/allocation/allocator_facade.h | 18 +- .../memory/allocation/retry_allocator.cc | 19 - .../fluid/memory/allocation/retry_allocator.h | 19 + .../allocation/stream_safe_cuda_allocator.cc | 160 +++++ .../allocation/stream_safe_cuda_allocator.h | 76 +++ paddle/fluid/memory/malloc.cc | 28 +- paddle/fluid/memory/malloc.h | 13 + paddle/fluid/memory/malloc_test.cu | 144 +++++ 11 files changed, 838 insertions(+), 209 deletions(-) create mode 100644 paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc create mode 100644 paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 58979d6c3e1852..4b4717cead36e5 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -88,7 +88,7 @@ endif() cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stream_safe_cuda_allocator) if (WITH_GPU) target_link_libraries(allocator_facade cuda_graph) @@ -113,6 +113,7 @@ cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator) cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator) +cc_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator) if(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index b11c657b96b74c..de108434154395 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -39,6 +39,7 @@ struct BadAlloc : public std::exception { }; class Allocator; +extern void NotifyGPURetryThreads(); // Allocation is the object holding the actually pointer. Use // `Allocation::ptr()` will returns the pointer that allocated. 
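// ---------------------------------------------------------------------------
// A minimal usage sketch of the stream-aware interface this patch adds
// (illustrative only, not part of the diff; it assumes device 0 is visible
// and that <cuda_runtime.h> and paddle/fluid/memory/malloc.h are included):
//
//   paddle::platform::CUDAPlace place(0);
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//
//   // Allocate 1 KB whose lifetime is tracked against `stream`.
//   auto buf = paddle::memory::AllocShared(place, stream, 1024);
//
//   // If another stream also consumes `buf`, record it so that the actual
//   // free is deferred until that stream's queued work finishes:
//   //   paddle::memory::RecordStream(buf.get(), other_stream);
//
//   buf.reset();                             // "free" (possibly deferred)
//   paddle::memory::Release(place, stream);  // return idle memory to the GPU
// ---------------------------------------------------------------------------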
@@ -155,8 +156,16 @@ class Allocator { class AllocationDeleter { public: inline void operator()(Allocation* allocation) const { + VLOG(10) << "Run AllocationDeleter"; Allocator* allocator = allocation->TopDecoratedAllocator(); allocator->Free(allocation); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(Ruibiao): Dirty code, consider a better design to notify CUDA + // alloc retry + if (platform::is_gpu_place(allocation->place())) { + NotifyGPURetryThreads(); + } +#endif } }; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9da735636fc00f..d5bab43d60c442 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -14,19 +14,29 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" +#include +#include +#include + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#endif #include "gflags/gflags.h" + +#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif -#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" +#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h" @@ -44,7 +54,6 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu/xpu_info.h" #endif -#include "paddle/fluid/platform/npu_info.h" PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, @@ -114,83 +123,249 @@ class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + using CUDAAllocatorMap = + std::map>>; +#endif + explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { strategy_ = GetAllocatorStrategy(); - switch (strategy_) { - case AllocatorStrategy::kNaiveBestFit: { - InitNaiveBestFitCPUAllocator(); -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); - } -#endif + CheckStrategy(strategy_); + + InitNaiveBestFitCPUAllocator(); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { - InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); - } - InitNaiveBestFitCUDAPinnedAllocator(); -#endif -#ifdef PADDLE_WITH_ASCEND_CL - for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { - InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); - } - InitNaiveBestFitNPUPinnedAllocator(); + allow_free_idle_chunk_ = allow_free_idle_chunk; + default_cuda_stream_ = nullptr; + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) 
{ + InitCUDAAllocator(platform::CUDAPlace(dev_id), default_cuda_stream_); + } + InitNaiveBestFitCUDAPinnedAllocator(); #endif - break; - } - case AllocatorStrategy::kAutoGrowth: { - InitNaiveBestFitCPUAllocator(); #ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); - } -#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { - InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), - allow_free_idle_chunk); - } - InitNaiveBestFitCUDAPinnedAllocator(); + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } #endif - break; - } - case AllocatorStrategy::kThreadLocal: { - InitNaiveBestFitCPUAllocator(); -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); - } +#ifdef PADDLE_WITH_ASCEND_CL + if (strategy_ == AllocatorStrategy::kNaiveBestFit) { + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } + InitNaiveBestFitNPUPinnedAllocator(); + } #endif + + InitZeroSizeAllocators(); + InitSystemAllocators(); + CheckAllocThreadSafe(); + } + + const std::shared_ptr& GetAllocator(const platform::Place& place, + size_t size) { + VLOG(6) << "GetAllocator" + << " " << place << " " << size; + + if (platform::is_gpu_place(place) && size > 0) { + return GetCUDAAllocator(boost::get(place), + default_cuda_stream_); + } + + const auto& allocators = + (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_ + : GetAllocatorMap()) + : zero_size_allocators_); + auto iter = allocators.find(place); + PADDLE_ENFORCE_NE(iter, allocators.end(), + platform::errors::NotFound( + "No allocator found for the place, %s", place)); + return iter->second; + } + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { - InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); - } - InitNaiveBestFitCUDAPinnedAllocator(); -#endif - break; + const std::shared_ptr& GetCUDAAllocator( + const platform::CUDAPlace& place, const cudaStream_t& stream) { + auto place_it = cuda_allocators_.find(place); + PADDLE_ENFORCE_NE(place_it, cuda_allocators_.end(), + platform::errors::NotFound( + "No allocator found for the place %s", place)); + + const std::map>& allocator_map = + place_it->second; + auto stream_it = allocator_map.find(stream); + PADDLE_ENFORCE_NE( + stream_it, allocator_map.end(), + platform::errors::NotFound( + "No allocator found for stream %s in place %s", stream, place)); + + return stream_it->second; + } + + cudaStream_t GetDefaultCudaStream() { return default_cuda_stream_; } + + void NotifyGPURetryThreads() { cuda_retry_cv_.notify_all(); } + + void RecordStream(Allocation* allocation, const cudaStream_t& stream) { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(allocation->place()), true, + platform::errors::InvalidArgument( + "Not allow to record stream for an allocation with place %s", + allocation->place())); + dynamic_cast(allocation)->RecordStream(stream); + } + + AllocationPtr CUDAAlloc(const platform::CUDAPlace& place, + const cudaStream_t& stream, size_t size) { + std::shared_ptr cuda_allocator; + /* NOTE(Ruibiao): This code does not lead to lock competition + * for 
searching initialized CUDA allocator in multithreaded scenario.
+   * However, when the corresponding CUDA allocator is not initialized,
+   * it may result in large lookup overhead,
+   * which calls GetCUDAAllocator 3 times in the worst case.
+   **/
+    try {
+      cuda_allocator = GetCUDAAllocator(place, stream);
+    } catch (platform::EnforceNotMet& err) {
+      VLOG(9) << "No allocator found for stream " << stream << "in place "
+              << place << " , build a new one";
+      std::unique_lock lock(cuda_retry_mutex_);
+      try {
+        cuda_allocator = GetCUDAAllocator(place, stream);
+      } catch (platform::EnforceNotMet& err) {
+        InitCUDAAllocator(place, stream);
+        cuda_allocator = GetCUDAAllocator(place, stream);
+      } catch (...) {
+        throw;
+      }
+    } catch (...) {
+      throw;
+    }
-      default: {
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Unsupported allocator strategy: %d", static_cast(strategy_)));
+    if (FLAGS_gpu_allocator_retry_time <= 0) {
+      return cuda_allocator->Allocate(size);
+    }
+
+    // In fact, we can unify the code of allocation success and failure
+    // But it would add lock even when allocation success at the first time
+    try {
+      return cuda_allocator->Allocate(size);
+    } catch (BadAlloc&) {
+      VLOG(9) << "Allocation failed when allocating " << size
+              << " bytes for stream " << stream;
+      for (auto pair : cuda_allocators_[place]) {
+        std::shared_ptr cuda_allocator = pair.second;
+        std::dynamic_pointer_cast(cuda_allocator)
+            ->ProcessEventsAndFree();
      }
+      try {
+        return cuda_allocator->Allocate(size);
+      } catch (BadAlloc&) {
+        {
+          WaitedAllocateSizeGuard guard(&cuda_waited_allocate_size_, size);
+          VLOG(10)
+              << "Still allocation failed after calling ProcessEventAndFree, "
+              << " cuda_waited_allocate_size_ = " << cuda_waited_allocate_size_;
+          // We can just write allocation retry inside the predicate function of
+          // wait_until. But it needs to acquire the lock when executing
+          // predicate
+          // function. For better performance, we use loop here
+          auto end_time =
+              std::chrono::high_resolution_clock::now() +
+              std::chrono::milliseconds(FLAGS_gpu_allocator_retry_time);
+          auto wait_until = [&end_time, this] {
+            std::unique_lock lock(cuda_retry_mutex_);
+            return cuda_retry_cv_.wait_until(lock, end_time);
+          };
+
+          size_t retry_times = 0;
+          while (wait_until() != std::cv_status::timeout) {
+            try {
+              return cuda_allocator->Allocate(size);
+            } catch (BadAlloc&) {
+              ++retry_times;
+              VLOG(10) << "Allocation failed when retrying " << retry_times
+                       << " times when allocating " << size
+                       << " bytes. Wait still.";
+            } catch (...) {
+              throw;
+            }
+          }
+        }
+        VLOG(10) << "Allocation failed because of timeout when allocating "
+                 << size << " bytes.";
+        return cuda_allocator->Allocate(
+            size);  // If timeout, try last allocation request
+      } catch (...) {
+        throw;
+      }
+    } catch (...)
{ + throw; } - InitZeroSizeAllocators(); - InitSystemAllocators(); + } - if (FLAGS_gpu_allocator_retry_time > 0) { - WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time); +#ifdef PADDLE_WITH_CUDA + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_allocator_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset( + new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + for (auto& item : allocator->allocators_) { + auto& old_allocator = item.second; + old_allocator = CUDAGraphAllocator::Create(old_allocator); } + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; + } - CheckAllocThreadSafe(); + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + auto iter = cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_allocator_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; } +#endif +#endif + + private: + class ZeroSizeAllocator : public Allocator { + public: + explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} + bool IsAllocThreadSafe() const override { return true; } + + protected: + Allocation* AllocateImpl(size_t size) override { + return new Allocation(nullptr, 0, place_); + } + void FreeImpl(Allocation* allocation) override { delete allocation; } - inline const AllocatorMap& GetAllocatorMap() { + private: + platform::Place place_; + }; + + void CheckStrategy(AllocatorStrategy strategy) { + if (strategy != AllocatorStrategy::kNaiveBestFit && + strategy != AllocatorStrategy::kAutoGrowth && + strategy != AllocatorStrategy::kThreadLocal) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported allocator strategy: %d", static_cast(strategy_))); + } + } + + const AllocatorMap& GetAllocatorMap() { #ifdef PADDLE_WITH_CUDA if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { auto id = platform::CUDAGraph::CapturingID(); @@ -208,43 +383,6 @@ class AllocatorFacadePrivate { #endif } - inline const std::shared_ptr& GetAllocator( - const platform::Place& place, size_t size) { - VLOG(6) << "GetAllocator" - << " " << place << " " << size; - const auto& allocators = - (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? 
system_allocators_ - : GetAllocatorMap()) - : zero_size_allocators_); - auto iter = allocators.find(place); - PADDLE_ENFORCE_NE(iter, allocators.end(), - platform::errors::NotFound( - "No allocator found for the place, %s", place)); - return iter->second; - } - - private: - void InitSystemAllocators() { - if (!system_allocators_.empty()) return; - system_allocators_[platform::CPUPlace()] = std::make_shared(); -#ifdef PADDLE_WITH_XPU - int device_count = platform::GetXPUDeviceCount(); - for (int i = 0; i < device_count; ++i) { - platform::XPUPlace p(i); - system_allocators_[p] = std::make_shared(p); - } -#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - system_allocators_[platform::CUDAPinnedPlace()] = - std::make_shared(); - int device_count = platform::GetCUDADeviceCount(); - for (int i = 0; i < device_count; ++i) { - platform::CUDAPlace p(i); - system_allocators_[p] = std::make_shared(p); - } -#endif - } - void InitNaiveBestFitCPUAllocator() { allocators_[platform::CPUPlace()] = std::make_shared(platform::CPUPlace()); @@ -256,20 +394,38 @@ class AllocatorFacadePrivate { std::make_shared(platform::CUDAPinnedPlace()); } - void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) { - allocators_[p] = std::make_shared(p); + void InitCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { + switch (strategy_) { + case AllocatorStrategy::kNaiveBestFit: { + InitNaiveBestFitCUDAAllocator(p, stream); + break; + } + case AllocatorStrategy::kAutoGrowth: { + InitAutoGrowthCUDAAllocator(p, stream); + break; + } + case AllocatorStrategy::kThreadLocal: { + InitThreadLocalCUDAAllocator(p, stream); + break; + } + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported allocator strategy: %d", static_cast(strategy_))); + } + } + WrapStreamSafeCUDAAllocator(p, stream); } - void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) { - allocators_[p] = std::make_shared(p); + void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p, + cudaStream_t stream) { + cuda_allocators_[p][stream] = std::make_shared(p); } - void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, - bool allow_free_idle_chunk) { + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { #if defined(PADDLE_WITH_HIP) auto cuda_allocator = std::make_shared(p); - allocators_[p] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); + cuda_allocators_[p][stream] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk_); #endif #if defined(PADDLE_WITH_CUDA) @@ -290,15 +446,16 @@ class AllocatorFacadePrivate { if (val > 0 && FLAGS_use_virtual_memory_auto_growth) { auto cuda_allocator = std::make_shared(p); - allocators_[p] = + cuda_allocators_[p][stream] = std::make_shared( cuda_allocator, platform::GpuMinChunkSize(), p); } else { auto cuda_allocator = std::make_shared(p); - allocators_[p] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); + cuda_allocators_[p][stream] = + std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), + allow_free_idle_chunk_); } - #else auto cuda_allocator = std::make_shared(p); auto alignment = platform::GpuMinChunkSize(); @@ -332,11 +489,34 @@ class AllocatorFacadePrivate { VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; underlying_allocator = cuda_allocator; } - allocators_[p] = std::make_shared( - underlying_allocator, alignment, 0, allow_free_idle_chunk); + + cuda_allocators_[p][stream] = std::make_shared( + 
underlying_allocator, alignment, 0, allow_free_idle_chunk_); #endif #endif } + + void InitThreadLocalCUDAAllocator(platform::CUDAPlace p, + cudaStream_t stream) { + cuda_allocators_[p][stream] = std::make_shared(p); + } + + void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { + const std::shared_ptr& underlying_allocator = + GetCUDAAllocator(p, stream); + cuda_allocators_[p][stream] = + std::make_shared(underlying_allocator, stream); + } + + static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { + for (auto& place_pair : allocators) { + for (auto& stream_pair : place_pair.second) { + PADDLE_ENFORCE_EQ(stream_pair.second->IsAllocThreadSafe(), true, + platform::errors::InvalidArgument( + "Public allocators must be thread safe")); + } + } + } #endif #ifdef PADDLE_WITH_XPU @@ -354,26 +534,8 @@ class AllocatorFacadePrivate { allocators_[platform::NPUPinnedPlace()] = std::make_shared(); } - #endif - class ZeroSizeAllocator : public Allocator { - public: - explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} - - bool IsAllocThreadSafe() const override { return true; } - - protected: - Allocation* AllocateImpl(size_t size) override { - return new Allocation(nullptr, 0, place_); - } - - void FreeImpl(Allocation* allocation) override { delete allocation; } - - private: - platform::Place place_; - }; - void InitZeroSizeAllocators() { if (!zero_size_allocators_.empty()) return; std::vector places; @@ -411,74 +573,61 @@ class AllocatorFacadePrivate { } } - void CheckAllocThreadSafe() const { - CheckAllocThreadSafe(allocators_); - CheckAllocThreadSafe(zero_size_allocators_); - CheckAllocThreadSafe(system_allocators_); - } - - void WrapCUDARetryAllocator(size_t retry_time) { - PADDLE_ENFORCE_GT( - retry_time, 0, - platform::errors::InvalidArgument( - "Retry time should be larger than 0, but got %d", retry_time)); - for (auto& pair : allocators_) { - if (platform::is_gpu_place(pair.first)) { - pair.second = std::make_shared(pair.second, retry_time); - } + void InitSystemAllocators() { + if (!system_allocators_.empty()) return; + system_allocators_[platform::CPUPlace()] = std::make_shared(); +#ifdef PADDLE_WITH_XPU + int device_count = platform::GetXPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::XPUPlace p(i); + system_allocators_[p] = std::make_shared(p); } - } - -#ifdef PADDLE_WITH_CUDA - - public: - void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, - platform::errors::InvalidArgument( - "CUDA Graph is only supported when the " - "FLAGS_allocator_strategy=\"auto_growth\", but got " - "FLAGS_allocator_strategy=\"%s\"", - FLAGS_allocator_strategy)); - auto& allocator = cuda_graph_allocator_map_[id]; - PADDLE_ENFORCE_EQ( - allocator.get(), nullptr, - platform::errors::InvalidArgument( - "The memory pool of the CUDA Graph with ID %d have been prepared.", - id)); - allocator.reset( - new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); - for (auto& item : allocator->allocators_) { - auto& old_allocator = item.second; - old_allocator = CUDAGraphAllocator::Create(old_allocator); +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + system_allocators_[platform::CUDAPinnedPlace()] = + std::make_shared(); + int device_count = platform::GetCUDADeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::CUDAPlace p(i); + system_allocators_[p] = std::make_shared(p); } - VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; 
+#endif } - void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), - platform::errors::InvalidArgument( - "Cannot find CUDA Graph with ID = %d", id)); - cuda_graph_allocator_map_.erase(iter); - VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; - } + void CheckAllocThreadSafe() const { + CheckAllocThreadSafe(allocators_); + CheckAllocThreadSafe(zero_size_allocators_); + CheckAllocThreadSafe(system_allocators_); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + CheckCUDAAllocThreadSafe(cuda_allocators_); #endif + } - private: - AllocatorMap allocators_; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // a standalone CUDA allocator to support multi-stream GC in new executor + CUDAAllocatorMap cuda_allocators_; + cudaStream_t default_cuda_stream_; + static std::condition_variable cuda_retry_cv_; + std::mutex cuda_retry_mutex_; + std::mutex cuda_init_mutex_; + std::atomic cuda_waited_allocate_size_{0}; #ifdef PADDLE_WITH_CUDA std::unordered_map> cuda_graph_allocator_map_; +#endif #endif AllocatorStrategy strategy_; - + AllocatorMap allocators_; static AllocatorMap zero_size_allocators_; static AllocatorMap system_allocators_; + bool allow_free_idle_chunk_; }; - AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::zero_size_allocators_; AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +std::condition_variable AllocatorFacadePrivate::cuda_retry_cv_; +#endif // Pimpl. Make interface clean. AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} @@ -498,10 +647,22 @@ std::shared_ptr AllocatorFacade::AllocShared( AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (platform::is_gpu_place(place)) { + return Alloc(boost::get(place), + m_->GetDefaultCudaStream(), size); + } +#endif return m_->GetAllocator(place, size)->Allocate(size); } uint64_t AllocatorFacade::Release(const platform::Place& place) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (platform::is_gpu_place(place)) { + return Release(boost::get(place), + m_->GetDefaultCudaStream()); + } +#endif return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) ->Release(place); } @@ -511,6 +672,33 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +std::shared_ptr AllocatorFacade::AllocShared( + const platform::CUDAPlace& place, const cudaStream_t& stream, size_t size) { + return std::shared_ptr(Alloc(place, stream, size)); +} + +AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, + const cudaStream_t& stream, size_t size) { + if (size > 0) { + return m_->CUDAAlloc(place, stream, size); + } else { + return m_->GetAllocator(place, size)->Allocate(size); + } +} + +uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, + const cudaStream_t& stream) { + return m_->GetCUDAAllocator(place, stream)->Release(place); +} + +void AllocatorFacade::NotifyGPURetryThreads() { m_->NotifyGPURetryThreads(); } + +void AllocatorFacade::RecordStream(Allocation* allocation, + const cudaStream_t& stream) { + m_->RecordStream(allocation, stream); +} + #ifdef PADDLE_WITH_CUDA void 
AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { return m_->PrepareMemoryPoolForCUDAGraph(id); @@ -521,6 +709,10 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { } #endif +void NotifyGPURetryThreads() { + allocation::AllocatorFacade::Instance().NotifyGPURetryThreads(); +} +#endif } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 8d889ec38eed7e..507b36abd5404c 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -26,6 +26,7 @@ namespace paddle { namespace memory { namespace allocation { + #ifdef PADDLE_WITH_ASCEND_CL using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; #endif @@ -40,26 +41,37 @@ using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; class AllocatorFacadePrivate; class AllocatorFacade { public: - ~AllocatorFacade(); AllocatorFacade(const AllocatorFacade& o) = delete; const AllocatorFacade& operator=(const AllocatorFacade& o) = delete; + ~AllocatorFacade(); static AllocatorFacade& Instance(); // Allocate a shared allocation. std::shared_ptr AllocShared(const platform::Place& place, size_t size); - // Allocate a unique allocation. AllocationPtr Alloc(const platform::Place& place, size_t size); - // Release unused memory pool. uint64_t Release(const platform::Place& place); + const std::shared_ptr& GetAllocator(const platform::Place& place); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + std::shared_ptr AllocShared(const platform::CUDAPlace& place, + const cudaStream_t& stream, + size_t size); + AllocationPtr Alloc(const platform::CUDAPlace& place, + const cudaStream_t& stream, size_t size); + uint64_t Release(const platform::CUDAPlace& place, + const cudaStream_t& stream); + + void NotifyGPURetryThreads(); + void RecordStream(Allocation* allocation, const cudaStream_t& stream); #ifdef PADDLE_WITH_CUDA void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id); +#endif #endif // TODO(yy): Allocate a Copy-On-Write allocation? diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 1607af3808b434..86c7ad3dc5be16 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -20,25 +20,6 @@ namespace paddle { namespace memory { namespace allocation { -class WaitedAllocateSizeGuard { - public: - WaitedAllocateSizeGuard(std::atomic* waited_size, - size_t requested_size) - : waited_size_(waited_size), requested_size_(requested_size) { - waited_size_->fetch_add(requested_size_, - std::memory_order::memory_order_relaxed); - } - - ~WaitedAllocateSizeGuard() { - waited_size_->fetch_sub(requested_size_, - std::memory_order::memory_order_relaxed); - } - - private: - std::atomic* waited_size_; - size_t requested_size_; -}; - void RetryAllocator::FreeImpl(Allocation* allocation) { // Delete underlying allocation first. 
size_t size = allocation->size(); diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 031a5e2b97f178..36ff2e027c1a37 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -28,6 +28,25 @@ namespace paddle { namespace memory { namespace allocation { +class WaitedAllocateSizeGuard { + public: + WaitedAllocateSizeGuard(std::atomic* waited_size, + size_t requested_size) + : waited_size_(waited_size), requested_size_(requested_size) { + waited_size_->fetch_add(requested_size_, + std::memory_order::memory_order_relaxed); + } + + ~WaitedAllocateSizeGuard() { + waited_size_->fetch_sub(requested_size_, + std::memory_order::memory_order_relaxed); + } + + private: + std::atomic* waited_size_; + size_t requested_size_; +}; + class RetryAllocator : public Allocator { public: RetryAllocator(std::shared_ptr allocator, size_t retry_ms) diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc new file mode 100644 index 00000000000000..5558ddcbbb5361 --- /dev/null +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
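// ---------------------------------------------------------------------------
// Sketch of the core idea implemented in this file (illustrative standalone
// code, not part of the patch; raw CUDA runtime API instead of Paddle types):
// when an allocation is "freed", record one event per stream that used it and
// reclaim the memory only after every event has completed.
//
//   #include <cuda_runtime.h>
//   #include <vector>
//
//   struct DeferredFree {
//     void* ptr;
//     std::vector<cudaEvent_t> events;  // one per recorded stream
//   };
//
//   void MarkFreed(DeferredFree* d, const std::vector<cudaStream_t>& streams) {
//     for (cudaStream_t s : streams) {
//       cudaEvent_t e;
//       cudaEventCreateWithFlags(&e, cudaEventDisableTiming);
//       cudaEventRecord(e, s);  // fires once work queued so far on s is done
//       d->events.push_back(e);
//     }
//   }
//
//   // Frees and returns true only if no stream still has pending work.
//   bool TryReclaim(DeferredFree* d) {
//     for (cudaEvent_t e : d->events) {
//       if (cudaEventQuery(e) == cudaErrorNotReady) return false;
//     }
//     for (cudaEvent_t e : d->events) cudaEventDestroy(e);
//     cudaFree(d->ptr);
//     d->events.clear();
//     return true;
//   }
// ---------------------------------------------------------------------------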
+ +#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( + AllocationPtr underlying_allocation, cudaStream_t owning_stream) + : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)), + owning_stream_(owning_stream), + recorded_streams_(std::make_shared>()) {} + +void StreamSafeCUDAAllocation::RecordStream(cudaStream_t stream) { + VLOG(8) << "Record stream " << stream << " to " << ptr(); + if (stream == owning_stream_) { + return; + } + std::lock_guard lock(mutex_); + recorded_streams_->insert(stream); +} + +std::shared_ptr> +StreamSafeCUDAAllocation::GetRecordedStreams() { + return recorded_streams_; +} + +StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( + const std::shared_ptr& underlying_allocator, + const cudaStream_t default_stream) + : underlying_allocator_(underlying_allocator), + default_stream_(default_stream) {} + +bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } + +void StreamSafeCUDAAllocator::ProcessEventsAndFree() { + for (auto map_it = allocation_info_map_.begin(); + map_it != allocation_info_map_.end();) { + std::deque& outstanding_events = + map_it->second->outstanding_events; + VLOG(10) << "Check " << outstanding_events.size() + << " outstanding events for " << map_it->first->ptr(); + auto deque_it = outstanding_events.begin(); + while (deque_it != outstanding_events.end()) { + cudaError_t err = cudaEventQuery(*deque_it); + if (err == cudaErrorNotReady) { + VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() + << " is not complete"; + outstanding_events.erase(outstanding_events.begin(), deque_it); + break; + } + PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it)); + ++deque_it; + } + + if (deque_it == outstanding_events.end()) { + outstanding_events.clear(); + Allocation* allocation = map_it->first; + auto next_it = ++map_it; // "map_it" may be invalid after calling + // FreeStreamSafeCUDAAllocation + FreeStreamSafeCUDAAllocation(allocation); + map_it = next_it; + } else { + ++map_it; + } + } +} + +Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { + std::lock_guard lock(mutex_); + ProcessEventsAndFree(); + AllocationPtr underlying_allocation = underlying_allocator_->Allocate(size); + StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( + std::move(underlying_allocation), default_stream_); + allocation_info_map_[allocation] = std::make_shared(); + return allocation; +} + +void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { + std::lock_guard lock(mutex_); + GetAllocationInfo(allocation)->can_be_freed = true; + FreeStreamSafeCUDAAllocation(allocation); +} + +uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { + /*lock_guard*/ { + std::lock_guard lock(mutex_); + ProcessEventsAndFree(); + } + return underlying_allocator_->Release(place); +} + +void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream( + std::set* recorded_streams, + std::deque* outstanding_events) { + for (cudaStream_t stream : *recorded_streams) { + cudaEvent_t event; + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + 
outstanding_events->emplace_back(event); + VLOG(9) << "Record event " << event << " in stream " << stream; + } + recorded_streams->clear(); +} + +void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation( + Allocation* allocation) { + std::shared_ptr allocation_info = + GetAllocationInfo(allocation); + if (!allocation_info->can_be_freed) { + return; + } + + std::deque& outstanding_events = + allocation_info->outstanding_events; + CreateEventForAllRecordedStream( + dynamic_cast(allocation) + ->GetRecordedStreams() + .get(), + &outstanding_events); + if (!outstanding_events.empty()) { + VLOG(8) << allocation->ptr() << " is not ready to free"; + return; + } + + VLOG(8) << "Free " << allocation->ptr(); + allocation_info_map_.erase(allocation); + delete allocation; +} + +std::shared_ptr +StreamSafeCUDAAllocator::GetAllocationInfo(Allocation* allocation) { + auto it = allocation_info_map_.find(allocation); + PADDLE_ENFORCE_NE( + it, allocation_info_map_.end(), + "The recorded allocation is not malloced by this allocator."); + return it->second; +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h new file mode 100644 index 00000000000000..1746f537f374cb --- /dev/null +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -0,0 +1,76 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
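// ---------------------------------------------------------------------------
// The hazard RecordStream() guards against, in an illustrative fragment (not
// part of the patch). Without it, resetting the shared_ptr below could return
// the memory to the pool while stream_b's kernel is still reading it, letting
// a later allocation race with that kernel:
//
//   auto buf = AllocShared(place, stream_a, size);       // owned by stream_a
//   kernel<<<grid, block, 0, stream_b>>>(
//       static_cast<float*>(buf->ptr()));                // used by stream_b
//   RecordStream(buf.get(), stream_b);  // tell the allocator about stream_b
//   buf.reset();  // safe: the real free waits for stream_b's work to finish
// ---------------------------------------------------------------------------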
+ +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class StreamSafeCUDAAllocation : public Allocation { + public: + StreamSafeCUDAAllocation(AllocationPtr underlying_allocation, + cudaStream_t owning_stream); + void RecordStream(cudaStream_t stream); + std::shared_ptr> GetRecordedStreams(); + + private: + AllocationPtr underlying_allocation_; + cudaStream_t owning_stream_; + std::shared_ptr> recorded_streams_; + std::mutex mutex_; +}; + +class StreamSafeCUDAAllocator : public Allocator { + public: + StreamSafeCUDAAllocator( + const std::shared_ptr &underlying_allocator, + const cudaStream_t default_stream); + bool IsAllocThreadSafe() const override; + void ProcessEventsAndFree(); + + protected: + Allocation *AllocateImpl(size_t size) override; + void FreeImpl(Allocation *allocation) override; + uint64_t ReleaseImpl(const platform::Place &place) override; + + private: + struct AllocationInfo { + std::deque outstanding_events; + bool can_be_freed{false}; + }; + + void CreateEventForAllRecordedStream( + std::set *recorded_streams, + std::deque *outstanding_events); + void FreeStreamSafeCUDAAllocation(Allocation *allocation); + std::shared_ptr GetAllocationInfo(Allocation *); + + std::shared_ptr underlying_allocator_; + cudaStream_t default_stream_; + std::unordered_map> + allocation_info_map_; + mutable std::recursive_mutex mutex_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 078e841068ca57..55b4657df3ac62 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -20,18 +20,40 @@ limitations under the License. 
*/ namespace paddle { namespace memory { -std::shared_ptr AllocShared(const platform::Place &place, +std::shared_ptr AllocShared(const platform::Place& place, size_t size) { return allocation::AllocatorFacade::Instance().AllocShared(place, size); } -AllocationPtr Alloc(const platform::Place &place, size_t size) { +AllocationPtr Alloc(const platform::Place& place, size_t size) { return allocation::AllocatorFacade::Instance().Alloc(place, size); } -uint64_t Release(const platform::Place &place) { +uint64_t Release(const platform::Place& place) { return allocation::AllocatorFacade::Instance().Release(place); } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +std::shared_ptr AllocShared(const platform::CUDAPlace& place, + const cudaStream_t& stream, + size_t size) { + return allocation::AllocatorFacade::Instance().AllocShared(place, stream, + size); +} + +AllocationPtr Alloc(const platform::CUDAPlace& place, + const cudaStream_t& stream, size_t size) { + return allocation::AllocatorFacade::Instance().Alloc(place, stream, size); +} + +uint64_t Release(const platform::CUDAPlace& place, const cudaStream_t& stream) { + return allocation::AllocatorFacade::Instance().Release(place, stream); +} + +void RecordStream(Allocation* allocation, const cudaStream_t& stream) { + return allocation::AllocatorFacade::Instance().RecordStream(allocation, + stream); +} +#endif } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 3b8d07548ee0c0..0b49e1cbee1047 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -40,5 +40,18 @@ extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size); extern uint64_t Release(const platform::Place& place); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +extern std::shared_ptr AllocShared(const platform::CUDAPlace& place, + const cudaStream_t& stream, + size_t size); + +extern AllocationPtr Alloc(const platform::CUDAPlace& place, + const cudaStream_t& stream, size_t size); + +extern uint64_t Release(const platform::CUDAPlace& place, + const cudaStream_t& stream); + +void RecordStream(Allocation* allocation, const cudaStream_t& stream); +#endif } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index d015ed7ce693fb..e8d30e6fe98533 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -27,6 +27,11 @@ #include "gtest/gtest.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gpu_info.h" + +#if defined(PADDLE_WITH_CUDA) +DECLARE_int64(gpu_allocator_retry_time); +#endif namespace paddle { namespace memory { @@ -169,5 +174,144 @@ TEST(Malloc, AllocZero) { AllocationPtr allocation_ptr = Alloc(place, 0); EXPECT_GE(allocation_ptr->size(), 0); } + +TEST(Malloc, CUDAAllocRetry) { + platform::Place place = platform::CUDAPlace(); + size_t available_size = platform::GpuAvailableMemToAlloc(); + // alloc_size < available_size < 2 * alloc_size + size_t alloc_size = available_size / 4 * 3; + + auto alloc_fun = [&place, alloc_size]() { + return AllocShared(place, alloc_size); + }; + std::shared_ptr allocation = alloc_fun(); + auto start_time = std::chrono::steady_clock::now(); + std::thread th(alloc_fun); + std::this_thread::sleep_for(std::chrono::seconds(1)); + allocation.reset(); + th.join(); + auto end_time = std::chrono::steady_clock::now(); + + std::chrono::duration time = 
end_time - start_time;
+  VLOG(10) << "time cost = " << time.count() << " s";
+  EXPECT_LE(time.count() * 1000, FLAGS_gpu_allocator_retry_time);
+}
+
+__global__ void add_kernel(int *x, int n) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
+    atomicAdd(x + i, tid);
+  }
+}
+
+class StreamSafeCUDAAllocTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    place_ = platform::CUDAPlace();
+    stream_num_ = 64;
+    grid_num_ = 1;
+    block_num_ = 64;
+    data_num_ = 64;
+    default_stream = nullptr;
+
+    streams_.reserve(stream_num_);
+    streams_.emplace_back(default_stream);
+    for (size_t i = 1; i < stream_num_; ++i) {
+      cudaStream_t stream;
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
+      streams_.emplace_back(stream);
+    }
+
+    for (size_t i = 0; i < stream_num_; ++i) {
+      size_t allocation_size = data_num_ * sizeof(int);
+      std::shared_ptr allocation =
+          AllocShared(place_, streams_[i], allocation_size);
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaMemset(allocation->ptr(), 0, allocation->size()));
+      allocations_.emplace_back(allocation);
+    }
+  }
+
+  void SingleStreamRun(size_t idx) {
+    for (size_t i = 0; i < stream_num_; ++i) {
+      int *x = reinterpret_cast(allocations_[i]->ptr());
+      add_kernel<<>>(x, data_num_);
+      if (i != idx) {
+        RecordStream(allocations_[i].get(), streams_[idx]);
+      }
+    }
+  }
+
+  void MultiStreamRun() {
+    for (int i = 0; i < stream_num_; ++i) {
+      SingleStreamRun(i);
+    }
+    allocations_.clear();  // fast_gc
+  }
+
+  void MultiThreadMultiStreamRun() {
+    std::vector threads;
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads.push_back(
+          std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
+    }
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads[i].join();
+    }
+    allocations_.clear();  // fast_gc
+  }
+
+  void CheckResult() {
+    auto host_x = std::unique_ptr(new int[data_num_]);
+    size_t thread_num = grid_num_ * block_num_;
+    for (int i = 0; i < stream_num_; ++i) {
+      // tricky code, the allocations are still accessible even though
+      // allocations_.clear() has been called
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaMemcpy(host_x.get(), allocations_[i]->ptr(),
+                     data_num_ * sizeof(int), cudaMemcpyDeviceToHost));
+      for (int j = 0; j < data_num_; ++j) {
+        EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_);
+      }
+    }
+  }
+
+  void TearDown() override {
+    cudaDeviceSynchronize();
+    for (cudaStream_t stream : streams_) {
+      Release(place_, stream);
+    }
+
+    for (size_t i = 1; i < stream_num_; ++i) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i]));
+    }
+
+    uint64_t cuda_malloc_size =
+        platform::RecordedCudaMallocSize(place_.GetDeviceId());
+    ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size
+                                   << " bytes memory that not released yet, "
+                                      "there may be a memory leak problem.";
+  }
+
+  size_t stream_num_;
+  size_t grid_num_;
+  size_t block_num_;
+  size_t data_num_;
+  platform::CUDAPlace place_;
+  cudaStream_t default_stream;
+  std::vector streams_;
+  std::vector> allocations_;
+};
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMultiStream) {
+  MultiStreamRun();
+  CheckResult();
+}
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMultiThreadMultiStream) {
+  MultiThreadMultiStreamRun();
+  CheckResult();
+}
+
 }  // namespace memory
 }  // namespace paddle

From ee154eef809d8764d7792748c368218c8d7123a4 Mon Sep 17 00:00:00 2001
From: chenruibiao
Date: Thu, 18 Nov 2021 18:00:26 +0800
Subject: [PATCH 02/13] Do not notify retrying threads on other streams when
 freeing a CUDA allocation

---
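In outline, the allocation path after this commit becomes (an illustrative
sketch, not the literal code; GetCUDAAllocator, cuda_allocators_, and the
RetryAllocator wrapper all appear in the diff below):

    AllocationPtr CUDAAllocSketch(const platform::CUDAPlace& place,
                                  const cudaStream_t& stream, size_t size) {
      // Each per-stream allocator is now wrapped in a RetryAllocator, so a
      // first failure already waits up to FLAGS_gpu_allocator_retry_time ms.
      auto allocator = GetCUDAAllocator(place, stream);
      try {
        return allocator->Allocate(size);
      } catch (BadAlloc&) {
        // Last resort: release idle chunks cached by every stream's allocator
        // on this device, then retry once; a second BadAlloc propagates.
        for (auto& pair : cuda_allocators_[place]) {
          pair.second->Release(place);
        }
        return allocator->Allocate(size);
      }
    }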
paddle/fluid/memory/allocation/allocator.h | 9 -- .../memory/allocation/allocator_facade.cc | 83 +++++-------------- .../memory/allocation/allocator_facade.h | 2 - .../memory/allocation/retry_allocator.cc | 19 +++++ .../fluid/memory/allocation/retry_allocator.h | 19 ----- .../allocation/stream_safe_cuda_allocator.cc | 7 +- .../allocation/stream_safe_cuda_allocator.h | 1 - paddle/fluid/memory/malloc_test.cu | 42 ++++++---- 8 files changed, 67 insertions(+), 115 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index de108434154395..b11c657b96b74c 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -39,7 +39,6 @@ struct BadAlloc : public std::exception { }; class Allocator; -extern void NotifyGPURetryThreads(); // Allocation is the object holding the actually pointer. Use // `Allocation::ptr()` will returns the pointer that allocated. @@ -156,16 +155,8 @@ class Allocator { class AllocationDeleter { public: inline void operator()(Allocation* allocation) const { - VLOG(10) << "Run AllocationDeleter"; Allocator* allocator = allocation->TopDecoratedAllocator(); allocator->Free(allocation); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(Ruibiao): Dirty code, consider a better design to notify CUDA - // alloc retry - if (platform::is_gpu_place(allocation->place())) { - NotifyGPURetryThreads(); - } -#endif } }; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index d5bab43d60c442..636798d09d4c42 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -170,7 +170,7 @@ class AllocatorFacadePrivate { << " " << place << " " << size; if (platform::is_gpu_place(place) && size > 0) { - return GetCUDAAllocator(boost::get(place), + return GetCUDAAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), default_cuda_stream_); } @@ -206,8 +206,6 @@ class AllocatorFacadePrivate { cudaStream_t GetDefaultCudaStream() { return default_cuda_stream_; } - void NotifyGPURetryThreads() { cuda_retry_cv_.notify_all(); } - void RecordStream(Allocation* allocation, const cudaStream_t& stream) { PADDLE_ENFORCE_EQ( platform::is_gpu_place(allocation->place()), true, @@ -229,9 +227,9 @@ class AllocatorFacadePrivate { try { cuda_allocator = GetCUDAAllocator(place, stream); } catch (platform::EnforceNotMet& err) { - VLOG(9) << "No allocator found for stream " << stream << "in place " + VLOG(9) << "No allocator found for stream " << stream << " in place " << place << " , build a new one"; - std::unique_lock lock(cuda_retry_mutex_); + std::unique_lock lock(cuda_init_mutex_); try { cuda_allocator = GetCUDAAllocator(place, stream); } catch (platform::EnforceNotMet& err) { @@ -244,61 +242,19 @@ class AllocatorFacadePrivate { throw; } - if (FLAGS_gpu_allocator_retry_time <= 0) { - return cuda_allocator->Allocate(size); - } - - // In fact, we can unify the code of allocation success and failure - // But it would add lock even when allocation success at the first time try { return cuda_allocator->Allocate(size); } catch (BadAlloc&) { VLOG(9) << "Allocation failed when allocating " << size << " bytes for stream " << stream; for (auto pair : cuda_allocators_[place]) { - std::shared_ptr cuda_allocator = pair.second; - std::dynamic_pointer_cast(cuda_allocator) - ->ProcessEventsAndFree(); + pair.second->Release(place); } try { return cuda_allocator->Allocate(size); - } 
catch (BadAlloc&) { - { - WaitedAllocateSizeGuard guard(&cuda_waited_allocate_size_, size); - VLOG(10) - << "Still allocation failed after calling ProcessEventAndFree, " - << " cuda_waited_allocate_size_ = " << cuda_waited_allocate_size_; - // We can just write allocation retry inside the predicate function of - // wait_until. But it needs to acquire the lock when executing - // predicate - // function. For better performance, we use loop here - auto end_time = - std::chrono::high_resolution_clock::now() + - std::chrono::milliseconds(FLAGS_gpu_allocator_retry_time); - auto wait_until = [&end_time, this] { - std::unique_lock lock(cuda_retry_mutex_); - return cuda_retry_cv_.wait_until(lock, end_time); - }; - - size_t retry_times = 0; - while (wait_until() != std::cv_status::timeout) { - try { - return cuda_allocator->Allocate(size); - } catch (BadAlloc&) { - ++retry_times; - VLOG(10) << "Allocation failed when retrying " << retry_times - << " times when allocating " << size - << " bytes. Wait still."; - } catch (...) { - throw; - } - } - } - VLOG(10) << "Allocation failed because of timeout when allocating " - << size << " bytes."; - return cuda_allocator->Allocate( - size); // If timeout, try last allocation request } catch (...) { + VLOG(9) << "Still allocation failed " + << "after release memory from all streams"; throw; } } catch (...) { @@ -414,6 +370,7 @@ class AllocatorFacadePrivate { } } WrapStreamSafeCUDAAllocator(p, stream); + WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time); } void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p, @@ -501,6 +458,16 @@ class AllocatorFacadePrivate { cuda_allocators_[p][stream] = std::make_shared(p); } + void WrapCUDARetryAllocator(platform::CUDAPlace p, cudaStream_t stream, + size_t retry_time) { + PADDLE_ENFORCE_GT( + retry_time, 0, + platform::errors::InvalidArgument( + "Retry time should be larger than 0, but got %d", retry_time)); + std::shared_ptr& allocator = cuda_allocators_[p][stream]; + allocator = std::make_shared(allocator, retry_time); + } + void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { const std::shared_ptr& underlying_allocator = GetCUDAAllocator(p, stream); @@ -607,10 +574,7 @@ class AllocatorFacadePrivate { // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; cudaStream_t default_cuda_stream_; - static std::condition_variable cuda_retry_cv_; - std::mutex cuda_retry_mutex_; std::mutex cuda_init_mutex_; - std::atomic cuda_waited_allocate_size_{0}; #ifdef PADDLE_WITH_CUDA std::unordered_map> cuda_graph_allocator_map_; @@ -625,9 +589,6 @@ class AllocatorFacadePrivate { AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::zero_size_allocators_; AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -std::condition_variable AllocatorFacadePrivate::cuda_retry_cv_; -#endif // Pimpl. Make interface clean. 
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} @@ -649,7 +610,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { - return Alloc(boost::get(place), + return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultCudaStream(), size); } #endif @@ -659,7 +620,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { - return Release(boost::get(place), + return Release(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultCudaStream()); } #endif @@ -692,8 +653,6 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, return m_->GetCUDAAllocator(place, stream)->Release(place); } -void AllocatorFacade::NotifyGPURetryThreads() { m_->NotifyGPURetryThreads(); } - void AllocatorFacade::RecordStream(Allocation* allocation, const cudaStream_t& stream) { m_->RecordStream(allocation, stream); @@ -708,10 +667,6 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { return m_->RemoveMemoryPoolOfCUDAGraph(id); } #endif - -void NotifyGPURetryThreads() { - allocation::AllocatorFacade::Instance().NotifyGPURetryThreads(); -} #endif } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 507b36abd5404c..b61e1616fee45e 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -65,8 +65,6 @@ class AllocatorFacade { const cudaStream_t& stream, size_t size); uint64_t Release(const platform::CUDAPlace& place, const cudaStream_t& stream); - - void NotifyGPURetryThreads(); void RecordStream(Allocation* allocation, const cudaStream_t& stream); #ifdef PADDLE_WITH_CUDA void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 86c7ad3dc5be16..1607af3808b434 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -20,6 +20,25 @@ namespace paddle { namespace memory { namespace allocation { +class WaitedAllocateSizeGuard { + public: + WaitedAllocateSizeGuard(std::atomic* waited_size, + size_t requested_size) + : waited_size_(waited_size), requested_size_(requested_size) { + waited_size_->fetch_add(requested_size_, + std::memory_order::memory_order_relaxed); + } + + ~WaitedAllocateSizeGuard() { + waited_size_->fetch_sub(requested_size_, + std::memory_order::memory_order_relaxed); + } + + private: + std::atomic* waited_size_; + size_t requested_size_; +}; + void RetryAllocator::FreeImpl(Allocation* allocation) { // Delete underlying allocation first. 
size_t size = allocation->size(); diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 36ff2e027c1a37..031a5e2b97f178 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -28,25 +28,6 @@ namespace paddle { namespace memory { namespace allocation { -class WaitedAllocateSizeGuard { - public: - WaitedAllocateSizeGuard(std::atomic* waited_size, - size_t requested_size) - : waited_size_(waited_size), requested_size_(requested_size) { - waited_size_->fetch_add(requested_size_, - std::memory_order::memory_order_relaxed); - } - - ~WaitedAllocateSizeGuard() { - waited_size_->fetch_sub(requested_size_, - std::memory_order::memory_order_relaxed); - } - - private: - std::atomic* waited_size_; - size_t requested_size_; -}; - class RetryAllocator : public Allocator { public: RetryAllocator(std::shared_ptr allocator, size_t retry_ms) diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 5558ddcbbb5361..dcc203ae462ad9 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -73,8 +73,8 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { if (deque_it == outstanding_events.end()) { outstanding_events.clear(); Allocation* allocation = map_it->first; - auto next_it = ++map_it; // "map_it" may be invalid after calling - // FreeStreamSafeCUDAAllocation + // "map_it" may be invalid after calling FreeStreamSafeCUDAAllocation + auto next_it = ++map_it; FreeStreamSafeCUDAAllocation(allocation); map_it = next_it; } else { @@ -151,7 +151,8 @@ StreamSafeCUDAAllocator::GetAllocationInfo(Allocation* allocation) { auto it = allocation_info_map_.find(allocation); PADDLE_ENFORCE_NE( it, allocation_info_map_.end(), - "The recorded allocation is not malloced by this allocator."); + platform::errors::NotFound( + "The recorded allocation is not malloced by this allocator")); return it->second; } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 1746f537f374cb..33a542ddc97733 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -14,7 +14,6 @@ #pragma once -#include #include #include #include diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index e8d30e6fe98533..0bf9435ae700c2 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -175,26 +175,30 @@ TEST(Malloc, AllocZero) { EXPECT_GE(allocation_ptr->size(), 0); } -TEST(Malloc, CUDAAllocRetry) { - platform::Place place = platform::CUDAPlace(); +TEST(Malloc, StreamSafeCUDAAllocRetry) { + platform::CUDAPlace place = platform::CUDAPlace(); + cudaStream_t stream1, stream2; + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); + size_t available_size = platform::GpuAvailableMemToAlloc(); // alloc_size < available_size < 2 * alloc_size size_t alloc_size = available_size / 4 * 3; - auto alloc_fun = [&place, alloc_size]() { - return AllocShared(place, alloc_size); - }; - std::shared_ptr allocation = alloc_fun(); - auto start_time = std::chrono::steady_clock::now(); - std::thread th(alloc_fun); - std::this_thread::sleep_for(std::chrono::seconds(1)); - 
allocation.reset(); + std::shared_ptr allocation1 = + AllocShared(place, stream1, alloc_size); + std::shared_ptr allocation2; + + std::thread th([&allocation2, &place, &stream2, alloc_size]() { + std::this_thread::sleep_for(std::chrono::seconds(1)); + allocation2 = AllocShared(place, stream2, alloc_size); + }); + allocation1.reset(); // free but not release th.join(); - auto end_time = std::chrono::steady_clock::now(); + EXPECT_GE(allocation2->size(), alloc_size); - std::chrono::duration time = end_time - start_time; - VLOG(10) << "time cost = " << time.count() << " s"; - EXPECT_LE(time.count() * 1000, FLAGS_gpu_allocator_retry_time); + Release(place, stream1); + Release(place, stream2); } __global__ void add_kernel(int *x, int n) { @@ -213,6 +217,8 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { block_num_ = 64; data_num_ = 64; default_stream = nullptr; + cuda_malloc_size_before_test_ = + platform::RecordedCudaMallocSize(place_.GetDeviceId()); streams_.reserve(stream_num_); streams_.emplace_back(default_stream); @@ -288,15 +294,17 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { uint64_t cuda_malloc_size = platform::RecordedCudaMallocSize(place_.GetDeviceId()); - ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size - << " bytes memory that not released yet, " - "there may be a memory leak problem."; + ASSERT_EQ(cuda_malloc_size, cuda_malloc_size_before_test_) + << "Found " << cuda_malloc_size + << " bytes memory that not released yet, there may be a memory leak " + "problem."; } size_t stream_num_; size_t grid_num_; size_t block_num_; size_t data_num_; + uint64_t cuda_malloc_size_before_test_; platform::CUDAPlace place_; cudaStream_t default_stream; std::vector streams_; From c9e5291cfef8010b02bfb0fba6228033fc4e9d0e Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Thu, 18 Nov 2021 20:20:20 +0800 Subject: [PATCH 03/13] Fix compile error for CPU --- paddle/fluid/memory/allocation/CMakeLists.txt | 13 ++++++++----- paddle/fluid/memory/allocation/allocator_facade.cc | 13 ++++--------- .../memory/allocation/stream_safe_cuda_allocator.cc | 1 - .../memory/allocation/stream_safe_cuda_allocator.h | 2 ++ paddle/fluid/memory/malloc_test.cu | 6 ++---- 5 files changed, 16 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 4b4717cead36e5..4d44c533b7456f 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -15,8 +15,10 @@ endif() if (WITH_GPU) nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) - nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) + nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) + nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) + cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) if(CUDA_VERSION GREATER_EQUAL 10.2) nv_library(cuda_virtual_mem_allocator SRCS cuda_virtual_mem_allocator.cc DEPS dynload_cuda) @@ -25,8 +27,10 @@ endif() if (WITH_ROCM) hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) - hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) + hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc 
DEPS allocator) + hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) + cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) endif() @@ -38,7 +42,7 @@ endif() cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) if (WITH_GPU OR WITH_ROCM) - set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) + set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator) if(CUDA_VERSION GREATER_EQUAL 10.2) list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator) endif() @@ -88,7 +92,7 @@ endif() cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stream_safe_cuda_allocator) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) if (WITH_GPU) target_link_libraries(allocator_facade cuda_graph) @@ -113,7 +117,6 @@ cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator) cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator) -cc_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator) if(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 636798d09d4c42..902eba1adfc3d8 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -14,15 +14,7 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include -#include -#include - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#endif #include "gflags/gflags.h" - #include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" @@ -30,7 +22,6 @@ #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" -#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/place.h" @@ -38,8 +29,10 @@ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h" +#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/platform/gpu_info.h" #endif @@ -169,10 +162,12 @@ class AllocatorFacadePrivate { VLOG(6) << "GetAllocator" << " " << place << " " << size; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place) && 
size > 0) { return GetCUDAAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), default_cuda_stream_); } +#endif const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_ diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index dcc203ae462ad9..0993921ebc0ddc 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" -#include "paddle/fluid/memory/allocation/allocator.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 33a542ddc97733..8c5f32665b8770 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -14,9 +14,11 @@ #pragma once +#include #include #include #include +#include #include #include "paddle/fluid/memory/allocation/allocator.h" diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index 0bf9435ae700c2..b243acdeabc2c6 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -29,10 +29,6 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/gpu_info.h" -#if defined(PADDLE_WITH_CUDA) -DECLARE_int64(gpu_allocator_retry_time); -#endif - namespace paddle { namespace memory { @@ -175,6 +171,7 @@ TEST(Malloc, AllocZero) { EXPECT_GE(allocation_ptr->size(), 0); } +#ifdef PADDLE_WITH_CUDA TEST(Malloc, StreamSafeCUDAAllocRetry) { platform::CUDAPlace place = platform::CUDAPlace(); cudaStream_t stream1, stream2; @@ -320,6 +317,7 @@ TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStream) { MultiThreadMUltiStreamRun(); CheckResult(); } +#endif } // namespace memory } // namespace paddle From e1e8012380de0814853eed62aebbf5959ee8c650 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Thu, 18 Nov 2021 21:38:57 +0800 Subject: [PATCH 04/13] Fix compile error for HIP --- .../memory/allocation/allocator_facade.cc | 64 ++++++++++--------- .../memory/allocation/allocator_facade.h | 9 ++- .../allocation/stream_safe_cuda_allocator.cc | 43 +++++++++---- .../allocation/stream_safe_cuda_allocator.h | 26 ++++---- paddle/fluid/memory/malloc.cc | 10 +-- paddle/fluid/memory/malloc.h | 8 +-- paddle/fluid/memory/malloc_test.cu | 43 ++++++++++--- 7 files changed, 129 insertions(+), 74 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 902eba1adfc3d8..a47dba54932646 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -25,29 +25,36 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/platform/gpu_info.h" + 
+#ifdef PADDLE_WITH_CUDA +#include +#include "paddle/fluid/platform/cuda_graph.h" +#else +#include #endif + #if CUDA_VERSION >= 10020 #include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h" #include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" #include "paddle/fluid/platform/dynload/cuda_driver.h" #endif -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_graph.h" #endif + #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu/xpu_info.h" #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif + PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, "The retry time (milliseconds) when allocator fails " @@ -119,7 +126,7 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using CUDAAllocatorMap = std::map>>; + std::map>>; #endif explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { @@ -130,9 +137,9 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allow_free_idle_chunk_ = allow_free_idle_chunk; - default_cuda_stream_ = nullptr; + default_stream_ = nullptr; for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { - InitCUDAAllocator(platform::CUDAPlace(dev_id), default_cuda_stream_); + InitCUDAAllocator(platform::CUDAPlace(dev_id), default_stream_); } InitNaiveBestFitCUDAPinnedAllocator(); #endif @@ -165,7 +172,7 @@ class AllocatorFacadePrivate { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place) && size > 0) { return GetCUDAAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), - default_cuda_stream_); + default_stream_); } #endif @@ -182,13 +189,13 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const std::shared_ptr& GetCUDAAllocator( - const platform::CUDAPlace& place, const cudaStream_t& stream) { + const platform::CUDAPlace& place, const gpuStream_t& stream) { auto place_it = cuda_allocators_.find(place); PADDLE_ENFORCE_NE(place_it, cuda_allocators_.end(), platform::errors::NotFound( "No allocator found for the place %s", place)); - const std::map>& allocator_map = + const std::map>& allocator_map = place_it->second; auto stream_it = allocator_map.find(stream); PADDLE_ENFORCE_NE( @@ -199,9 +206,9 @@ class AllocatorFacadePrivate { return stream_it->second; } - cudaStream_t GetDefaultCudaStream() { return default_cuda_stream_; } + gpuStream_t GetDefaultStream() { return default_stream_; } - void RecordStream(Allocation* allocation, const cudaStream_t& stream) { + void RecordStream(Allocation* allocation, const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( platform::is_gpu_place(allocation->place()), true, platform::errors::InvalidArgument( @@ -211,7 +218,7 @@ class AllocatorFacadePrivate { } AllocationPtr CUDAAlloc(const platform::CUDAPlace& place, - const cudaStream_t& stream, size_t size) { + const gpuStream_t& stream, size_t size) { std::shared_ptr cuda_allocator; /* NOTE(Ruibiao): This code does not lead to lock competition * for seraching initialized CUDA allocator in multithreaded scenario. 
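The lookup scheme the NOTE above refers to is a double-checked pattern over
cuda_allocators_: the common case probes the map with no lock held, and
cuda_init_mutex_ is taken only the first time a (place, stream) pair is
requested. A minimal sketch of the idea with explicit find() calls (names
follow the patch; this is an illustration, not the verbatim implementation):

    std::shared_ptr<Allocator> GetOrCreateCUDAAllocator(
        const platform::CUDAPlace& place, const gpuStream_t& stream) {
      // Fast path: no lock when the (place, stream) allocator already exists.
      auto place_it = cuda_allocators_.find(place);
      if (place_it != cuda_allocators_.end()) {
        auto stream_it = place_it->second.find(stream);
        if (stream_it != place_it->second.end()) {
          return stream_it->second;
        }
      }
      // Slow path: serialize creation, then probe again in case another
      // thread built the allocator while this one waited on the mutex.
      std::unique_lock<std::mutex> lock(cuda_init_mutex_);
      place_it = cuda_allocators_.find(place);
      if (place_it == cuda_allocators_.end() ||
          place_it->second.find(stream) == place_it->second.end()) {
        InitCUDAAllocator(place, stream);  // create, wrap retry/stream-safe
      }
      return cuda_allocators_[place][stream];
    }

The re-check under the lock is what makes the worst case cost three lookups
while keeping the hot path lock-free.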
@@ -345,7 +352,7 @@ class AllocatorFacadePrivate { std::make_shared(platform::CUDAPinnedPlace()); } - void InitCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { + void InitCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCUDAAllocator(p, stream); @@ -369,11 +376,11 @@ class AllocatorFacadePrivate { } void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p, - cudaStream_t stream) { + gpuStream_t stream) { cuda_allocators_[p][stream] = std::make_shared(p); } - void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { #if defined(PADDLE_WITH_HIP) auto cuda_allocator = std::make_shared(p); cuda_allocators_[p][stream] = std::make_shared( @@ -448,12 +455,11 @@ class AllocatorFacadePrivate { #endif } - void InitThreadLocalCUDAAllocator(platform::CUDAPlace p, - cudaStream_t stream) { + void InitThreadLocalCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { cuda_allocators_[p][stream] = std::make_shared(p); } - void WrapCUDARetryAllocator(platform::CUDAPlace p, cudaStream_t stream, + void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, size_t retry_time) { PADDLE_ENFORCE_GT( retry_time, 0, @@ -463,7 +469,7 @@ class AllocatorFacadePrivate { allocator = std::make_shared(allocator, retry_time); } - void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { + void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { const std::shared_ptr& underlying_allocator = GetCUDAAllocator(p, stream); cuda_allocators_[p][stream] = @@ -568,7 +574,7 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; - cudaStream_t default_cuda_stream_; + gpuStream_t default_stream_; std::mutex cuda_init_mutex_; #ifdef PADDLE_WITH_CUDA std::unordered_map> @@ -606,7 +612,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), - m_->GetDefaultCudaStream(), size); + m_->GetDefaultStream(), size); } #endif return m_->GetAllocator(place, size)->Allocate(size); @@ -616,7 +622,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { return Release(BOOST_GET_CONST(platform::CUDAPlace, place), - m_->GetDefaultCudaStream()); + m_->GetDefaultStream()); } #endif return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) @@ -630,12 +636,12 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::shared_ptr AllocatorFacade::AllocShared( - const platform::CUDAPlace& place, const cudaStream_t& stream, size_t size) { + const platform::CUDAPlace& place, const gpuStream_t& stream, size_t size) { return std::shared_ptr(Alloc(place, stream, size)); } AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, - const cudaStream_t& stream, size_t size) { + const gpuStream_t& stream, size_t size) { if (size > 0) { return m_->CUDAAlloc(place, stream, size); } else { @@ -644,12 +650,12 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, } uint64_t 
AllocatorFacade::Release(const platform::CUDAPlace& place, - const cudaStream_t& stream) { + const gpuStream_t& stream) { return m_->GetCUDAAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(Allocation* allocation, - const cudaStream_t& stream) { + const gpuStream_t& stream) { m_->RecordStream(allocation, stream); } diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index b61e1616fee45e..813cf62c1366c9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -59,13 +59,12 @@ class AllocatorFacade { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::shared_ptr AllocShared(const platform::CUDAPlace& place, - const cudaStream_t& stream, + const gpuStream_t& stream, size_t size); AllocationPtr Alloc(const platform::CUDAPlace& place, - const cudaStream_t& stream, size_t size); - uint64_t Release(const platform::CUDAPlace& place, - const cudaStream_t& stream); - void RecordStream(Allocation* allocation, const cudaStream_t& stream); + const gpuStream_t& stream, size_t size); + uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); + void RecordStream(Allocation* allocation, const gpuStream_t& stream); #ifdef PADDLE_WITH_CUDA void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id); diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 0993921ebc0ddc..d7515baa2787b2 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -19,14 +19,14 @@ namespace memory { namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( - AllocationPtr underlying_allocation, cudaStream_t owning_stream) + AllocationPtr underlying_allocation, gpuStream_t owning_stream) : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)), owning_stream_(owning_stream), - recorded_streams_(std::make_shared>()) {} + recorded_streams_(std::make_shared>()) {} -void StreamSafeCUDAAllocation::RecordStream(cudaStream_t stream) { +void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) { VLOG(8) << "Record stream " << stream << " to " << ptr(); if (stream == owning_stream_) { return; @@ -35,14 +35,14 @@ void StreamSafeCUDAAllocation::RecordStream(cudaStream_t stream) { recorded_streams_->insert(stream); } -std::shared_ptr> +std::shared_ptr> StreamSafeCUDAAllocation::GetRecordedStreams() { return recorded_streams_; } StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( const std::shared_ptr& underlying_allocator, - const cudaStream_t default_stream) + const gpuStream_t default_stream) : underlying_allocator_(underlying_allocator), default_stream_(default_stream) {} @@ -51,13 +51,14 @@ bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } void StreamSafeCUDAAllocator::ProcessEventsAndFree() { for (auto map_it = allocation_info_map_.begin(); map_it != allocation_info_map_.end();) { - std::deque& outstanding_events = + std::deque& outstanding_events = map_it->second->outstanding_events; VLOG(10) << "Check " << outstanding_events.size() << " outstanding events for " << map_it->first->ptr(); auto deque_it = outstanding_events.begin(); while (deque_it != outstanding_events.end()) { - cudaError_t err 
= cudaEventQuery(*deque_it);
+#ifdef PADDLE_WITH_CUDA
+      gpuError_t err = cudaEventQuery(*deque_it);
       if (err == cudaErrorNotReady) {
         VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr()
                  << " is not complete";
@@ -67,6 +68,18 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() {
       PADDLE_ENFORCE_CUDA_SUCCESS(err);
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it));
       ++deque_it;
+#else
+      gpuError_t err = hipEventQuery(*deque_it);
+      if (err == hipErrorNotReady) {
+        VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr()
+                 << " is not complete";
+        outstanding_events.erase(outstanding_events.begin(), deque_it);
+        break;
+      }
+      PADDLE_ENFORCE_CUDA_SUCCESS(err);
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it));
+      ++deque_it;
+#endif
     }
 
     if (deque_it == outstanding_events.end()) {
@@ -107,13 +120,19 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
 }
 
 void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream(
-    std::set<cudaStream_t>* recorded_streams,
-    std::deque<cudaEvent_t>* outstanding_events) {
-  for (cudaStream_t stream : *recorded_streams) {
-    cudaEvent_t event;
+    std::set<gpuStream_t>* recorded_streams,
+    std::deque<gpuEvent_t>* outstanding_events) {
+  for (gpuStream_t stream : *recorded_streams) {
+    gpuEvent_t event;
+#ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
+#else
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        hipEventCreateWithFlags(&event, hipEventDisableTiming));
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream));
+#endif
     outstanding_events->emplace_back(event);
     VLOG(9) << "Record event " << event << " in stream " << stream;
   }
@@ -128,7 +147,7 @@ void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation(
     return;
   }
 
-  std::deque<cudaEvent_t>& outstanding_events =
+  std::deque<gpuEvent_t>& outstanding_events =
       allocation_info->outstanding_events;
   CreateEventForAllRecordedStream(
       dynamic_cast<StreamSafeCUDAAllocation*>(allocation)
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
index 8c5f32665b8770..f74c01e970c109 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
@@ -13,8 +13,12 @@
 // limitations under the License.
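Both branches of the header below compile because Paddle aliases the stream,
event and error handles to toolchain-specific types. Roughly, and as an
illustration only (the exact platform header that defines these aliases is an
assumption, it is not part of this patch):

    #ifdef PADDLE_WITH_HIP
    using gpuStream_t = hipStream_t;
    using gpuEvent_t = hipEvent_t;
    using gpuError_t = hipError_t;
    #else  // PADDLE_WITH_CUDA
    using gpuStream_t = cudaStream_t;
    using gpuEvent_t = cudaEvent_t;
    using gpuError_t = cudaError_t;
    #endif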
#pragma once - +#ifdef PADDLE_WITH_CUDA #include +#else +#include +#endif + #include #include #include @@ -29,14 +33,14 @@ namespace allocation { class StreamSafeCUDAAllocation : public Allocation { public: StreamSafeCUDAAllocation(AllocationPtr underlying_allocation, - cudaStream_t owning_stream); - void RecordStream(cudaStream_t stream); - std::shared_ptr> GetRecordedStreams(); + gpuStream_t owning_stream); + void RecordStream(gpuStream_t stream); + std::shared_ptr> GetRecordedStreams(); private: AllocationPtr underlying_allocation_; - cudaStream_t owning_stream_; - std::shared_ptr> recorded_streams_; + gpuStream_t owning_stream_; + std::shared_ptr> recorded_streams_; std::mutex mutex_; }; @@ -44,7 +48,7 @@ class StreamSafeCUDAAllocator : public Allocator { public: StreamSafeCUDAAllocator( const std::shared_ptr &underlying_allocator, - const cudaStream_t default_stream); + const gpuStream_t default_stream); bool IsAllocThreadSafe() const override; void ProcessEventsAndFree(); @@ -55,18 +59,18 @@ class StreamSafeCUDAAllocator : public Allocator { private: struct AllocationInfo { - std::deque outstanding_events; + std::deque outstanding_events; bool can_be_freed{false}; }; void CreateEventForAllRecordedStream( - std::set *recorded_streams, - std::deque *outstanding_events); + std::set *recorded_streams, + std::deque *outstanding_events); void FreeStreamSafeCUDAAllocation(Allocation *allocation); std::shared_ptr GetAllocationInfo(Allocation *); std::shared_ptr underlying_allocator_; - cudaStream_t default_stream_; + gpuStream_t default_stream_; std::unordered_map> allocation_info_map_; mutable std::recursive_mutex mutex_; diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 55b4657df3ac62..9cd391acca6a45 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -35,22 +35,22 @@ uint64_t Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::shared_ptr AllocShared(const platform::CUDAPlace& place, - const cudaStream_t& stream, + const gpuStream_t& stream, size_t size) { return allocation::AllocatorFacade::Instance().AllocShared(place, stream, size); } -AllocationPtr Alloc(const platform::CUDAPlace& place, - const cudaStream_t& stream, size_t size) { +AllocationPtr Alloc(const platform::CUDAPlace& place, const gpuStream_t& stream, + size_t size) { return allocation::AllocatorFacade::Instance().Alloc(place, stream, size); } -uint64_t Release(const platform::CUDAPlace& place, const cudaStream_t& stream) { +uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } -void RecordStream(Allocation* allocation, const cudaStream_t& stream) { +void RecordStream(Allocation* allocation, const gpuStream_t& stream) { return allocation::AllocatorFacade::Instance().RecordStream(allocation, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 0b49e1cbee1047..4926dc572cad4c 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -42,16 +42,16 @@ extern uint64_t Release(const platform::Place& place); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) extern std::shared_ptr AllocShared(const platform::CUDAPlace& place, - const cudaStream_t& stream, + const gpuStream_t& stream, size_t size); extern AllocationPtr Alloc(const platform::CUDAPlace& place, - const cudaStream_t& stream, size_t size); + const gpuStream_t& stream, size_t size); extern uint64_t 
Release(const platform::CUDAPlace& place, - const cudaStream_t& stream); + const gpuStream_t& stream); -void RecordStream(Allocation* allocation, const cudaStream_t& stream); +void RecordStream(Allocation* allocation, const gpuStream_t& stream); #endif } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index b243acdeabc2c6..3d0c1896bfaafa 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -174,10 +174,14 @@ TEST(Malloc, AllocZero) { #ifdef PADDLE_WITH_CUDA TEST(Malloc, StreamSafeCUDAAllocRetry) { platform::CUDAPlace place = platform::CUDAPlace(); - cudaStream_t stream1, stream2; + gpuStream_t stream1, stream2; +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); - +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream1)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream2)); +#endif size_t available_size = platform::GpuAvailableMemToAlloc(); // alloc_size < available_size < 2 * alloc_size size_t alloc_size = available_size / 4 * 3; @@ -220,8 +224,12 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { streams_.reserve(stream_num_); streams_.emplace_back(default_stream); for (size_t i = 1; i < stream_num_; ++i) { - cudaStream_t stream; + gpuStream_t stream; +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); +#endif streams_.emplace_back(stream); } @@ -229,8 +237,13 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { size_t allocation_size = data_num_ * sizeof(int); std::shared_ptr allocation = AllocShared(place_, streams_[i], allocation_size); +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemset(allocation->ptr(), 0, allocation->size())); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + hipMemset(allocation->ptr(), 0, allocation->size())); +#endif allocations_.emplace_back(allocation); } } @@ -268,11 +281,17 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { auto host_x = std::unique_ptr(new int[data_num_]); size_t thread_num = grid_num_ * block_num_; for (int i = 0; i < stream_num_; ++i) { - // tricky code, the allocations are still accessible even though - // allocations_.clear() has been called +// tricky code, the allocations are still accessible even though +// allocations_.clear() has been called +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemcpy(host_x.get(), allocations_[i]->ptr(), data_num_ * sizeof(int), cudaMemcpyDeviceToHost)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + hipMemcpy(host_x.get(), allocations_[i]->ptr(), + data_num_ * sizeof(int), hipMemcpyDeviceToHost)); +#endif for (int j = 0; j < data_num_; ++j) { EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_); } @@ -280,13 +299,21 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { } void TearDown() override { +#ifdef PADDLE_WITH_CUDA cudaDeviceSynchronize(); - for (cudaStream_t stream : streams_) { +#else + hipDeviceSynchronize(); +#endif + for (gpuStream_t stream : streams_) { Release(place_, stream); } for (size_t i = 1; i < stream_num_; ++i) { +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i])); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(streams_[i])); +#endif } uint64_t cuda_malloc_size = @@ -303,8 +330,8 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { size_t data_num_; uint64_t cuda_malloc_size_before_test_; 
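   // place_ defaults to device 0; default_stream stays nullptr on purpose so
   // that streams_[0] exercises the legacy default stream alongside the
   // explicitly created ones (see SetUp above).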
platform::CUDAPlace place_; - cudaStream_t default_stream; - std::vector streams_; + gpuStream_t default_stream; + std::vector streams_; std::vector> allocations_; }; From dc7a05503ae590ff3e310b118827aa1b4a6d19b2 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Fri, 19 Nov 2021 11:23:20 +0800 Subject: [PATCH 05/13] Release memory for StreamSafeCUDAAllocaRetry in malloc_test --- .../allocation/stream_safe_cuda_allocator.cc | 94 +++++++++---------- .../allocation/stream_safe_cuda_allocator.h | 2 +- paddle/fluid/memory/malloc_test.cu | 23 ++--- 3 files changed, 60 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index d7515baa2787b2..7ac20ab9b0f967 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -48,53 +48,6 @@ StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } -void StreamSafeCUDAAllocator::ProcessEventsAndFree() { - for (auto map_it = allocation_info_map_.begin(); - map_it != allocation_info_map_.end();) { - std::deque& outstanding_events = - map_it->second->outstanding_events; - VLOG(10) << "Check " << outstanding_events.size() - << " outstanding events for " << map_it->first->ptr(); - auto deque_it = outstanding_events.begin(); - while (deque_it != outstanding_events.end()) { -#ifdef PADDLE_WITH_CUDA - gpuError_t err = cudaEventQuery(*deque_it); - if (err == cudaErrorNotReady) { - VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() - << " is not complete"; - outstanding_events.erase(outstanding_events.begin(), deque_it); - break; - } - PADDLE_ENFORCE_CUDA_SUCCESS(err); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it)); - ++deque_it; -#else - gpuError_t err = hipEventQuery(*deque_it); - if (err == hipErrorNotReady) { - VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() - << " is not complete"; - outstanding_events.erase(outstanding_events.begin(), deque_it); - break; - } - PADDLE_ENFORCE_CUDA_SUCCESS(err); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it)); - ++deque_it; -#endif - } - - if (deque_it == outstanding_events.end()) { - outstanding_events.clear(); - Allocation* allocation = map_it->first; - // "map_it" may be invalid after calling FreeStreamSafeCUDAAllocation - auto next_it = ++map_it; - FreeStreamSafeCUDAAllocation(allocation); - map_it = next_it; - } else { - ++map_it; - } - } -} - Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { std::lock_guard lock(mutex_); ProcessEventsAndFree(); @@ -174,6 +127,53 @@ StreamSafeCUDAAllocator::GetAllocationInfo(Allocation* allocation) { return it->second; } +void StreamSafeCUDAAllocator::ProcessEventsAndFree() { + for (auto map_it = allocation_info_map_.begin(); + map_it != allocation_info_map_.end();) { + std::deque& outstanding_events = + map_it->second->outstanding_events; + VLOG(10) << "Check " << outstanding_events.size() + << " outstanding events for " << map_it->first->ptr(); + auto deque_it = outstanding_events.begin(); + while (deque_it != outstanding_events.end()) { +#ifdef PADDLE_WITH_CUDA + gpuError_t err = cudaEventQuery(*deque_it); + if (err == cudaErrorNotReady) { + VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() + << " is not complete"; + outstanding_events.erase(outstanding_events.begin(), deque_it); + break; + } + PADDLE_ENFORCE_CUDA_SUCCESS(err); + 
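+      // cudaEventQuery returns cudaSuccess only once every operation captured
+      // by the matching cudaEventRecord has finished; cudaErrorNotReady was
+      // handled above and any other error was re-raised by the enforce, so
+      // the event is known complete here and can be destroyed safely.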
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it)); + ++deque_it; +#else + gpuError_t err = hipEventQuery(*deque_it); + if (err == hipErrorNotReady) { + VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() + << " is not complete"; + outstanding_events.erase(outstanding_events.begin(), deque_it); + break; + } + PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it)); + ++deque_it; +#endif + } + + if (deque_it == outstanding_events.end()) { + outstanding_events.clear(); + Allocation* allocation = map_it->first; + // "map_it" may be invalid after calling FreeStreamSafeCUDAAllocation + auto next_it = ++map_it; + FreeStreamSafeCUDAAllocation(allocation); + map_it = next_it; + } else { + ++map_it; + } + } +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index f74c01e970c109..f625a8063becd6 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -50,7 +50,6 @@ class StreamSafeCUDAAllocator : public Allocator { const std::shared_ptr &underlying_allocator, const gpuStream_t default_stream); bool IsAllocThreadSafe() const override; - void ProcessEventsAndFree(); protected: Allocation *AllocateImpl(size_t size) override; @@ -68,6 +67,7 @@ class StreamSafeCUDAAllocator : public Allocator { std::deque *outstanding_events); void FreeStreamSafeCUDAAllocation(Allocation *allocation); std::shared_ptr GetAllocationInfo(Allocation *); + void ProcessEventsAndFree(); std::shared_ptr underlying_allocator_; gpuStream_t default_stream_; diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index 3d0c1896bfaafa..cf7e1276e88b5a 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -171,7 +171,6 @@ TEST(Malloc, AllocZero) { EXPECT_GE(allocation_ptr->size(), 0); } -#ifdef PADDLE_WITH_CUDA TEST(Malloc, StreamSafeCUDAAllocRetry) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; @@ -197,6 +196,13 @@ TEST(Malloc, StreamSafeCUDAAllocRetry) { allocation1.reset(); // free but not release th.join(); EXPECT_GE(allocation2->size(), alloc_size); + allocation2.reset(); + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); +#endif Release(place, stream1); Release(place, stream2); @@ -218,8 +224,6 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { block_num_ = 64; data_num_ = 64; default_stream = nullptr; - cuda_malloc_size_before_test_ = - platform::RecordedCudaMallocSize(place_.GetDeviceId()); streams_.reserve(stream_num_); streams_.emplace_back(default_stream); @@ -300,9 +304,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { void TearDown() override { #ifdef PADDLE_WITH_CUDA - cudaDeviceSynchronize(); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); #else - hipDeviceSynchronize(); + PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); #endif for (gpuStream_t stream : streams_) { Release(place_, stream); @@ -318,17 +322,15 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { uint64_t cuda_malloc_size = platform::RecordedCudaMallocSize(place_.GetDeviceId()); - ASSERT_EQ(cuda_malloc_size, cuda_malloc_size_before_test_) - << "Found " << cuda_malloc_size - << " bytes memory that not released yet, there may be 
a memory leak " - "problem."; + ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size + << " bytes memory that not released yet," + << " there may be a memory leak problem"; } size_t stream_num_; size_t grid_num_; size_t block_num_; size_t data_num_; - uint64_t cuda_malloc_size_before_test_; platform::CUDAPlace place_; gpuStream_t default_stream; std::vector streams_; @@ -344,7 +346,6 @@ TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStream) { MultiThreadMUltiStreamRun(); CheckResult(); } -#endif } // namespace memory } // namespace paddle From 165d4999704517797d97d4355a56d1f6dab82910 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Sat, 20 Nov 2021 21:12:25 +0800 Subject: [PATCH 06/13] Add FLAGS_use_stream_safe_cuda_allocator --- paddle/fluid/memory/CMakeLists.txt | 6 + .../memory/allocation/allocator_facade.cc | 320 +++++++++++++----- paddle/fluid/memory/malloc_test.cu | 178 ---------- .../memory/stream_safe_cuda_alloc_test.cu | 211 ++++++++++++ 4 files changed, 454 insertions(+), 261 deletions(-) create mode 100644 paddle/fluid/memory/stream_safe_cuda_alloc_test.cu diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 75b1bffca31f84..0de114ae40728b 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -17,6 +17,12 @@ if (WITH_GPU) nv_test(malloc_test SRCS malloc_test.cu DEPS device_context malloc) + nv_test(stream_safe_cuda_alloc_test + SRCS stream_safe_cuda_alloc_test.cu + DEPS malloc) + set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES + ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true" + ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") endif() if (WITH_ROCM) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index a47dba54932646..0c1d458687a1ec 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -68,6 +68,12 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, "Use VirtualMemoryAutoGrowthBestFitAllocator."); +// NOTE(Ruibiao): This FLAGS is just to be compatible with the old single-stream +// CUDA allocator, and will be remove after StreamSafeCudaAllocator has been +// fully tested +PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true, + "Enable StreamSafeCUDAAllocator"); + DECLARE_string(allocator_strategy); namespace paddle { @@ -131,36 +137,88 @@ class AllocatorFacadePrivate { explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { strategy_ = GetAllocatorStrategy(); - CheckStrategy(strategy_); - - InitNaiveBestFitCPUAllocator(); + switch (strategy_) { + case AllocatorStrategy::kNaiveBestFit: { + InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + ++dev_id) { + InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); + } + InitNaiveBestFitCUDAPinnedAllocator(); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } + InitNaiveBestFitNPUPinnedAllocator(); +#endif + break; + } + case AllocatorStrategy::kAutoGrowth: { + InitNaiveBestFitCPUAllocator(); #if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - allow_free_idle_chunk_ = allow_free_idle_chunk; - default_stream_ = nullptr; - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { - InitCUDAAllocator(platform::CUDAPlace(dev_id), default_stream_); - } - InitNaiveBestFitCUDAPinnedAllocator(); + allow_free_idle_chunk_ = allow_free_idle_chunk; + if (FLAGS_use_stream_safe_cuda_allocator) { + // TODO(Ruibiao): Support multi-stream allocator for other strategies + default_stream_ = nullptr; + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + ++dev_id) { + InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), + default_stream_); + } + } else { + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + ++dev_id) { + InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), + allow_free_idle_chunk_); + } + } + InitNaiveBestFitCUDAPinnedAllocator(); #endif +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif + break; + } + case AllocatorStrategy::kThreadLocal: { + InitNaiveBestFitCPUAllocator(); #ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); - } + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + ++dev_id) { + InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); + } + InitNaiveBestFitCUDAPinnedAllocator(); #endif + break; + } -#ifdef PADDLE_WITH_ASCEND_CL - if (strategy_ == AllocatorStrategy::kNaiveBestFit) { - for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { - InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported allocator strategy: %d", static_cast(strategy_))); } - InitNaiveBestFitNPUPinnedAllocator(); } -#endif - InitZeroSizeAllocators(); InitSystemAllocators(); + + if (FLAGS_gpu_allocator_retry_time > 0) { + WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time); + } + CheckAllocThreadSafe(); } @@ -220,12 +278,6 @@ class AllocatorFacadePrivate { AllocationPtr CUDAAlloc(const platform::CUDAPlace& place, const gpuStream_t& stream, size_t size) { std::shared_ptr cuda_allocator; - /* NOTE(Ruibiao): This code does not lead to lock competition - * for seraching initialized CUDA allocator in multithreaded scenario. - * However, when the corresponding CUDA allocator is not initialized, - * it may result in large lookup overhead, - * which call GetCUDAAAllocator 3 times in the worst case. - **/ try { cuda_allocator = GetCUDAAllocator(place, stream); } catch (platform::EnforceNotMet& err) { @@ -234,8 +286,10 @@ class AllocatorFacadePrivate { std::unique_lock lock(cuda_init_mutex_); try { cuda_allocator = GetCUDAAllocator(place, stream); + VLOG(9) << "Other thread had build a new allocator for stream " + << stream << " in place " << place; } catch (platform::EnforceNotMet& err) { - InitCUDAAllocator(place, stream); + InitStreamSafeCUDAAllocator(place, stream); cuda_allocator = GetCUDAAllocator(place, stream); } catch (...) 
{ throw; @@ -314,15 +368,6 @@ class AllocatorFacadePrivate { platform::Place place_; }; - void CheckStrategy(AllocatorStrategy strategy) { - if (strategy != AllocatorStrategy::kNaiveBestFit && - strategy != AllocatorStrategy::kAutoGrowth && - strategy != AllocatorStrategy::kThreadLocal) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported allocator strategy: %d", static_cast(strategy_))); - } - } - const AllocatorMap& GetAllocatorMap() { #ifdef PADDLE_WITH_CUDA if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { @@ -352,32 +397,23 @@ class AllocatorFacadePrivate { std::make_shared(platform::CUDAPinnedPlace()); } - void InitCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { - switch (strategy_) { - case AllocatorStrategy::kNaiveBestFit: { - InitNaiveBestFitCUDAAllocator(p, stream); - break; - } - case AllocatorStrategy::kAutoGrowth: { - InitAutoGrowthCUDAAllocator(p, stream); - break; - } - case AllocatorStrategy::kThreadLocal: { - InitThreadLocalCUDAAllocator(p, stream); - break; - } - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported allocator strategy: %d", static_cast(strategy_))); - } - } + void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { + PADDLE_ENFORCE_EQ( + strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::Unimplemented( + "Now only support auto-growth strategey for " + "StreamSafeCUDAAllocator, " + "the allocator strategy %d is unsupported for multi-stream", + static_cast(strategy_))); + VLOG(9) << "Init CUDA allocator for stream " << stream << " in place " << p; + + InitAutoGrowthCUDAAllocator(p, stream); WrapStreamSafeCUDAAllocator(p, stream); WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time); } - void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p, - gpuStream_t stream) { - cuda_allocators_[p][stream] = std::make_shared(p); + void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) { + allocators_[p] = std::make_shared(p); } void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { @@ -455,8 +491,83 @@ class AllocatorFacadePrivate { #endif } - void InitThreadLocalCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { - cuda_allocators_[p][stream] = std::make_shared(p); + // NOTE(Ruibiao): Old single-stream version, will be removed later + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, + bool allow_free_idle_chunk) { +#if defined(PADDLE_WITH_HIP) + auto cuda_allocator = std::make_shared(p); + allocators_[p] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); +#endif + +#if defined(PADDLE_WITH_CUDA) +#if CUDA_VERSION >= 10020 + CUdevice device; + int val; + try { + PADDLE_ENFORCE_CUDA_SUCCESS( + paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); + + PADDLE_ENFORCE_CUDA_SUCCESS( + paddle::platform::dynload::cuDeviceGetAttribute( + &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, + device)); + } catch (...) 
{ + val = 0; + } + + if (val > 0 && FLAGS_use_virtual_memory_auto_growth) { + auto cuda_allocator = std::make_shared(p); + allocators_[p] = + std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), p); + } else { + auto cuda_allocator = std::make_shared(p); + allocators_[p] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); + } + +#else + auto cuda_allocator = std::make_shared(p); + auto alignment = platform::GpuMinChunkSize(); + bool need_addr_align = true; + // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda + // API in that case may got cuda error(3), i.e., + // cudaErrorInitializationError. And, the CUDAAllocator is only initialized + // but not really used. + // Here, the try-catch block is added to handle the case that + // GetDeviceProperties() may failed in the multiple process(for example, in + // dataloader with num_worker > 0) + try { + const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); + need_addr_align = prop.textureAlignment < alignment; + VLOG(4) << "GetDeviceProperties ok, textureAlignment: " + << prop.textureAlignment + << ", set need_addr_align=" << need_addr_align; + } catch (...) { + need_addr_align = true; + VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true"; + } + // The address returned is aligned already, + // ref: + // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295 + std::shared_ptr underlying_allocator{nullptr}; + if (need_addr_align) { + VLOG(10) << "use AlignedAllocator with alignment: " << alignment; + underlying_allocator = + std::make_shared(underlying_allocator, alignment); + } else { + VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; + underlying_allocator = cuda_allocator; + } + allocators_[p] = std::make_shared( + underlying_allocator, alignment, 0, allow_free_idle_chunk); +#endif +#endif + } + + void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) { + allocators_[p] = std::make_shared(p); } void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, @@ -504,6 +615,27 @@ class AllocatorFacadePrivate { } #endif + void InitSystemAllocators() { + if (!system_allocators_.empty()) return; + system_allocators_[platform::CPUPlace()] = std::make_shared(); +#ifdef PADDLE_WITH_XPU + int device_count = platform::GetXPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::XPUPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + system_allocators_[platform::CUDAPinnedPlace()] = + std::make_shared(); + int device_count = platform::GetCUDADeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::CUDAPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif + } + void InitZeroSizeAllocators() { if (!zero_size_allocators_.empty()) return; std::vector places; @@ -541,36 +673,30 @@ class AllocatorFacadePrivate { } } - void InitSystemAllocators() { - if (!system_allocators_.empty()) return; - system_allocators_[platform::CPUPlace()] = std::make_shared(); -#ifdef PADDLE_WITH_XPU - int device_count = platform::GetXPUDeviceCount(); - for (int i = 0; i < device_count; ++i) { - platform::XPUPlace p(i); - system_allocators_[p] = std::make_shared(p); - } -#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - system_allocators_[platform::CUDAPinnedPlace()] = - std::make_shared(); - int device_count = platform::GetCUDADeviceCount(); - for (int i = 0; i < 
device_count; ++i) { - platform::CUDAPlace p(i); - system_allocators_[p] = std::make_shared(p); - } -#endif - } - void CheckAllocThreadSafe() const { CheckAllocThreadSafe(allocators_); CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - CheckCUDAAllocThreadSafe(cuda_allocators_); + if (FLAGS_use_stream_safe_cuda_allocator) { + CheckCUDAAllocThreadSafe(cuda_allocators_); + } #endif } + // NOTE(Ruibiao): Old single-stream version, will be removed later + void WrapCUDARetryAllocator(size_t retry_time) { + PADDLE_ENFORCE_GT( + retry_time, 0, + platform::errors::InvalidArgument( + "Retry time should be larger than 0, but got %d", retry_time)); + for (auto& pair : allocators_) { + if (platform::is_gpu_place(pair.first)) { + pair.second = std::make_shared(pair.second, retry_time); + } + } + } + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; @@ -610,7 +736,7 @@ std::shared_ptr AllocatorFacade::AllocShared( AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::is_gpu_place(place)) { + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place)) { return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultStream(), size); } @@ -620,7 +746,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::is_gpu_place(place)) { + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place)) { return Release(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultStream()); } @@ -637,11 +763,25 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::shared_ptr AllocatorFacade::AllocShared( const platform::CUDAPlace& place, const gpuStream_t& stream, size_t size) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'AllocaShared' function. " + "To enable it, you can enter 'export " + "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); return std::shared_ptr(Alloc(place, stream, size)); } AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, const gpuStream_t& stream, size_t size) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'Alloca' function. " + "To enable it, you can enter 'export " + "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); if (size > 0) { return m_->CUDAAlloc(place, stream, size); } else { @@ -651,11 +791,25 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'Release' function. 
" + "To enable it, you can enter 'export " + "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); return m_->GetCUDAAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(Allocation* allocation, const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "'RecordStream' function. " + "To enable it, you can enter 'export " + "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); m_->RecordStream(allocation, stream); } diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index cf7e1276e88b5a..d015ed7ce693fb 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -27,7 +27,6 @@ #include "gtest/gtest.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace memory { @@ -170,182 +169,5 @@ TEST(Malloc, AllocZero) { AllocationPtr allocation_ptr = Alloc(place, 0); EXPECT_GE(allocation_ptr->size(), 0); } - -TEST(Malloc, StreamSafeCUDAAllocRetry) { - platform::CUDAPlace place = platform::CUDAPlace(); - gpuStream_t stream1, stream2; -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream1)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream2)); -#endif - size_t available_size = platform::GpuAvailableMemToAlloc(); - // alloc_size < available_size < 2 * alloc_size - size_t alloc_size = available_size / 4 * 3; - - std::shared_ptr allocation1 = - AllocShared(place, stream1, alloc_size); - std::shared_ptr allocation2; - - std::thread th([&allocation2, &place, &stream2, alloc_size]() { - std::this_thread::sleep_for(std::chrono::seconds(1)); - allocation2 = AllocShared(place, stream2, alloc_size); - }); - allocation1.reset(); // free but not release - th.join(); - EXPECT_GE(allocation2->size(), alloc_size); - allocation2.reset(); - -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); -#endif - - Release(place, stream1); - Release(place, stream2); -} - -__global__ void add_kernel(int *x, int n) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < n; i += blockDim.x * gridDim.x) { - atomicAdd(x + i, tid); - } -} - -class StreamSafeCUDAAllocTest : public ::testing::Test { - protected: - void SetUp() override { - place_ = platform::CUDAPlace(); - stream_num_ = 64; - grid_num_ = 1; - block_num_ = 64; - data_num_ = 64; - default_stream = nullptr; - - streams_.reserve(stream_num_); - streams_.emplace_back(default_stream); - for (size_t i = 1; i < stream_num_; ++i) { - gpuStream_t stream; -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); -#endif - streams_.emplace_back(stream); - } - - for (size_t i = 0; i < stream_num_; ++i) { - size_t allocation_size = data_num_ * sizeof(int); - std::shared_ptr allocation = - AllocShared(place_, streams_[i], allocation_size); -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemset(allocation->ptr(), 0, allocation->size())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemset(allocation->ptr(), 0, allocation->size())); -#endif - 
allocations_.emplace_back(allocation); - } - } - - void SingleStreamRun(size_t idx) { - for (size_t i = 0; i < stream_num_; ++i) { - int *x = reinterpret_cast(allocations_[i]->ptr()); - add_kernel<<>>(x, data_num_); - if (i != idx) { - RecordStream(allocations_[i].get(), streams_[idx]); - } - } - } - - void MultiStreamRun() { - for (int i = 0; i < stream_num_; ++i) { - SingleStreamRun(i); - } - allocations_.clear(); // fast_gc - } - - void MultiThreadMUltiStreamRun() { - std::vector threads; - for (size_t i = 0; i < stream_num_; ++i) { - threads.push_back( - std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); - } - for (size_t i = 0; i < stream_num_; ++i) { - threads[i].join(); - } - allocations_.clear(); // fast_gc - } - - void CheckResult() { - auto host_x = std::unique_ptr(new int[data_num_]); - size_t thread_num = grid_num_ * block_num_; - for (int i = 0; i < stream_num_; ++i) { -// tricky code, the allocations are still accessible even though -// allocations_.clear() has been called -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpy(host_x.get(), allocations_[i]->ptr(), - data_num_ * sizeof(int), cudaMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemcpy(host_x.get(), allocations_[i]->ptr(), - data_num_ * sizeof(int), hipMemcpyDeviceToHost)); -#endif - for (int j = 0; j < data_num_; ++j) { - EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_); - } - } - } - - void TearDown() override { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); -#endif - for (gpuStream_t stream : streams_) { - Release(place_, stream); - } - - for (size_t i = 1; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i])); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(streams_[i])); -#endif - } - - uint64_t cuda_malloc_size = - platform::RecordedCudaMallocSize(place_.GetDeviceId()); - ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size - << " bytes memory that not released yet," - << " there may be a memory leak problem"; - } - - size_t stream_num_; - size_t grid_num_; - size_t block_num_; - size_t data_num_; - platform::CUDAPlace place_; - gpuStream_t default_stream; - std::vector streams_; - std::vector> allocations_; -}; - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStream) { - MultiStreamRun(); - CheckResult(); -} - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStream) { - MultiThreadMUltiStreamRun(); - CheckResult(); -} - } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu new file mode 100644 index 00000000000000..a75ef88141d6ca --- /dev/null +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -0,0 +1,211 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
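// ---- Editor's note (illustrative, not part of the patch) ----
// The expected values asserted by CheckResult() in the new test below can be
// derived directly: add_kernel is launched once per stream for every buffer,
// and each launch executes atomicAdd(x + tid, tid) for thread tid. With
// grid_num_ = 1 and block_num_ = 64 there are thread_num = 64 threads, and
// each thread touches only index i = tid (the next stride, tid + 64, already
// exceeds n = 64). Hence after all stream_num_ = 64 launches:
//
//   x[j] == (j % thread_num) * stream_num_   // e.g. x[3] == 3 * 64 == 192
//
// which is precisely the condition EXPECT_TRUE checks in CheckResult().
// --------------------------------------------------------------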
+ +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace memory { + +__global__ void add_kernel(int *x, int n) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < n; i += blockDim.x * gridDim.x) { + atomicAdd(x + i, tid); + } +} + +class StreamSafeCUDAAllocTest : public ::testing::Test { + protected: + void SetUp() override { + place_ = platform::CUDAPlace(); + stream_num_ = 64; + grid_num_ = 1; + block_num_ = 64; + data_num_ = 64; + default_stream = nullptr; + + streams_.reserve(stream_num_); + streams_.emplace_back(default_stream); + for (size_t i = 1; i < stream_num_; ++i) { + gpuStream_t stream; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); +#endif + streams_.emplace_back(stream); + } + + for (size_t i = 0; i < stream_num_; ++i) { + size_t allocation_size = data_num_ * sizeof(int); + std::shared_ptr allocation = + AllocShared(place_, streams_[i], allocation_size); +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemset(allocation->ptr(), 0, allocation->size())); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + hipMemset(allocation->ptr(), 0, allocation->size())); +#endif + allocations_.emplace_back(allocation); + } + } + + void SingleStreamRun(size_t idx) { + for (size_t i = 0; i < stream_num_; ++i) { + int *x = reinterpret_cast(allocations_[i]->ptr()); + add_kernel<<>>(x, data_num_); + if (i != idx) { + RecordStream(allocations_[i].get(), streams_[idx]); + } + } + } + + void MultiStreamRun() { + for (int i = 0; i < stream_num_; ++i) { + SingleStreamRun(i); + } + allocations_.clear(); // fast_gc + } + + void MultiThreadMUltiStreamRun() { + std::vector threads; + for (size_t i = 0; i < stream_num_; ++i) { + threads.push_back( + std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); + } + for (size_t i = 0; i < stream_num_; ++i) { + threads[i].join(); + } + allocations_.clear(); // fast_gc + } + + void CheckResult() { + auto host_x = std::unique_ptr(new int[data_num_]); + size_t thread_num = grid_num_ * block_num_; + for (int i = 0; i < stream_num_; ++i) { +// tricky code, the allocations are still accessible even though +// allocations_.clear() has been called +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpy(host_x.get(), allocations_[i]->ptr(), + data_num_ * sizeof(int), cudaMemcpyDeviceToHost)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + hipMemcpy(host_x.get(), allocations_[i]->ptr(), + data_num_ * sizeof(int), hipMemcpyDeviceToHost)); +#endif + for (int j = 0; j < data_num_; ++j) { + EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_); + } + } + } + + void TearDown() override { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); +#endif + for (gpuStream_t stream : streams_) { + Release(place_, stream); + } + + for (size_t i = 1; i < stream_num_; ++i) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i])); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(streams_[i])); +#endif + } + + uint64_t cuda_malloc_size = + platform::RecordedCudaMallocSize(place_.GetDeviceId()); + ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size + << " bytes memory that not released yet," + << 
" there may be a memory leak problem"; + } + + size_t stream_num_; + size_t grid_num_; + size_t block_num_; + size_t data_num_; + platform::CUDAPlace place_; + gpuStream_t default_stream; + std::vector streams_; + std::vector> allocations_; +}; + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStream) { + MultiStreamRun(); + CheckResult(); +} + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStream) { + MultiThreadMUltiStreamRun(); + CheckResult(); +} + +TEST(StreamSafeCUDAAllocRetryTest, StreamSafeCUDAAllocRetry) { + platform::CUDAPlace place = platform::CUDAPlace(); + gpuStream_t stream1, stream2; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream1)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream2)); +#endif + size_t available_size = platform::GpuAvailableMemToAlloc(); + // alloc_size < available_size < 2 * alloc_size + size_t alloc_size = available_size / 4 * 3; + + std::shared_ptr allocation1 = + AllocShared(place, stream1, alloc_size); + std::shared_ptr allocation2; + + std::thread th([&allocation2, &place, &stream2, alloc_size]() { + std::this_thread::sleep_for(std::chrono::seconds(1)); + allocation2 = AllocShared(place, stream2, alloc_size); + }); + allocation1.reset(); // free but not release + th.join(); + EXPECT_GE(allocation2->size(), alloc_size); + allocation2.reset(); + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); +#endif + + Release(place, stream1); + Release(place, stream2); +} + +} // namespace memory +} // namespace paddle From dc50b288e0634e980c4494f6e3b47bb3e927dd38 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Sat, 20 Nov 2021 21:28:59 +0800 Subject: [PATCH 07/13] Fix CI error for 'set_tests_properties' --- paddle/fluid/memory/CMakeLists.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 0de114ae40728b..64fd04b5ea82ba 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -20,9 +20,12 @@ if (WITH_GPU) nv_test(stream_safe_cuda_alloc_test SRCS stream_safe_cuda_alloc_test.cu DEPS malloc) - set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES - ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true" - ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") + + if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test) + set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES + ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true" + ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") + endif() endif() if (WITH_ROCM) From 43353c3db54c24350d5cbd69d1ddcf08caebd0fd Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Sat, 20 Nov 2021 21:54:40 +0800 Subject: [PATCH 08/13] Invalidate stream safe CUDA allocator for naive_best_fit and thread_local strategy --- .../memory/allocation/allocator_facade.cc | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 0c1d458687a1ec..9f5dec1a7c4cc9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -140,18 +140,23 @@ class AllocatorFacadePrivate { switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { 
InitNaiveBestFitCPUAllocator(); -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); - } -#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator) { + LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " + "naive_best_fit strategy"; + FLAGS_use_stream_safe_cuda_allocator = false; + } for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); #endif +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif #ifdef PADDLE_WITH_ASCEND_CL for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); @@ -198,6 +203,12 @@ class AllocatorFacadePrivate { } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator) { + LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " + "thread_local strategy"; + FLAGS_use_stream_safe_cuda_allocator = false; + } + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); @@ -228,7 +239,8 @@ class AllocatorFacadePrivate { << " " << place << " " << size; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(place) && size > 0) { + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + size > 0) { return GetCUDAAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), default_stream_); } From dc1f33c0026fc29b0485bba84dc19c2daafd8c45 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Mon, 22 Nov 2021 15:47:25 +0800 Subject: [PATCH 09/13] Performance improvement: insert allocation pair to outstanding_events_map when free but not alloc; replace recursive_mutex with SpinLock --- .../memory/allocation/allocator_facade.cc | 7 +-- .../allocation/stream_safe_cuda_allocator.cc | 45 ++++++------------- .../allocation/stream_safe_cuda_allocator.h | 16 +++---- 3 files changed, 22 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9f5dec1a7c4cc9..8e3aa349e72bce 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -68,9 +68,10 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, "Use VirtualMemoryAutoGrowthBestFitAllocator."); -// NOTE(Ruibiao): This FLAGS is just to be compatible with the old single-stream -// CUDA allocator, and will be remove after StreamSafeCudaAllocator has been -// fully tested +// NOTE(Ruibiao): This FLAGS is just to be compatibled with the old +// single-stream +// CUDA allocator. It will be removed after StreamSafeCudaAllocator has been +// fully tested. 
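// ---- Editor's sketch (illustrative, not part of the patch) ----
// How a caller is expected to combine the multi-stream interface with
// RecordStream(); both are used this way in stream_safe_cuda_alloc_test.cu.
// The argument order shown is the one in force at this point in the series
// (place, stream, size); patch 13 later changes it to (place, size, stream).
//
//   gpuStream_t stream_a, stream_b;  // created with cudaStreamCreate
//   auto buf = memory::AllocShared(place, stream_a, size);  // owned by A
//   // ... launch work that reads/writes buf->ptr() on stream_b ...
//   memory::RecordStream(buf.get(), stream_b);  // register the extra stream
//   buf.reset();  // safe: the real free waits on A's and B's recorded events
// ----------------------------------------------------------------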
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true, "Enable StreamSafeCUDAAllocator"); diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 7ac20ab9b0f967..5996786995656d 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -31,7 +31,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) { if (stream == owning_stream_) { return; } - std::lock_guard lock(mutex_); + std::lock_guard lock_guard(spin_lock_); recorded_streams_->insert(stream); } @@ -49,24 +49,22 @@ StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { - std::lock_guard lock(mutex_); + std::lock_guard lock_guard(spin_lock_); ProcessEventsAndFree(); AllocationPtr underlying_allocation = underlying_allocator_->Allocate(size); StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( std::move(underlying_allocation), default_stream_); - allocation_info_map_[allocation] = std::make_shared(); return allocation; } void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { - std::lock_guard lock(mutex_); - GetAllocationInfo(allocation)->can_be_freed = true; + std::lock_guard lock_guard(spin_lock_); FreeStreamSafeCUDAAllocation(allocation); } uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { /*lock_guard*/ { - std::lock_guard lock(mutex_); + std::lock_guard lock_guard(spin_lock_); ProcessEventsAndFree(); } return underlying_allocator_->Release(place); @@ -94,14 +92,8 @@ void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream( void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation( Allocation* allocation) { - std::shared_ptr allocation_info = - GetAllocationInfo(allocation); - if (!allocation_info->can_be_freed) { - return; - } - std::deque& outstanding_events = - allocation_info->outstanding_events; + outstanding_events_map_[allocation]; CreateEventForAllRecordedStream( dynamic_cast(allocation) ->GetRecordedStreams() @@ -113,25 +105,14 @@ void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation( } VLOG(8) << "Free " << allocation->ptr(); - allocation_info_map_.erase(allocation); + outstanding_events_map_.erase(allocation); delete allocation; } -std::shared_ptr -StreamSafeCUDAAllocator::GetAllocationInfo(Allocation* allocation) { - auto it = allocation_info_map_.find(allocation); - PADDLE_ENFORCE_NE( - it, allocation_info_map_.end(), - platform::errors::NotFound( - "The recorded allocation is not malloced by this allocator")); - return it->second; -} - void StreamSafeCUDAAllocator::ProcessEventsAndFree() { - for (auto map_it = allocation_info_map_.begin(); - map_it != allocation_info_map_.end();) { - std::deque& outstanding_events = - map_it->second->outstanding_events; + for (auto map_it = outstanding_events_map_.begin(); + map_it != outstanding_events_map_.end();) { + std::deque& outstanding_events = map_it->second; VLOG(10) << "Check " << outstanding_events.size() << " outstanding events for " << map_it->first->ptr(); auto deque_it = outstanding_events.begin(); @@ -140,25 +121,25 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { gpuError_t err = cudaEventQuery(*deque_it); if (err == cudaErrorNotReady) { VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() - << " is not complete"; + << " is not completed"; 
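// ---- Editor's sketch (illustrative, not part of the patch) ----
// The loop being edited here implements deferred freeing: a block freed by
// the host is only handed back to the underlying allocator once every event
// recorded for it has completed. Reduced to a standalone helper, the CUDA
// branch does the equivalent of:
//
//   bool AllEventsCompleted(std::deque<gpuEvent_t>* events) {
//     while (!events->empty()) {
//       gpuError_t err = cudaEventQuery(events->front());
//       if (err == cudaErrorNotReady) return false;  // GPU still in flight
//       PADDLE_ENFORCE_CUDA_SUCCESS(err);            // surface real errors
//       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(events->front()));
//       events->pop_front();
//     }
//     return true;  // all streams are past the free point
//   }
// ----------------------------------------------------------------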
        outstanding_events.erase(outstanding_events.begin(), deque_it);
         break;
       }
       PADDLE_ENFORCE_CUDA_SUCCESS(err);
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it));
-      ++deque_it;
 #else
       gpuError_t err = hipEventQuery(*deque_it);
       if (err == hipErrorNotReady) {
         VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr()
-                 << " is not complete";
+                 << " is not completed";
+        // Erase the completed events before "deque_it"
         outstanding_events.erase(outstanding_events.begin(), deque_it);
         break;
       }
       PADDLE_ENFORCE_CUDA_SUCCESS(err);
       PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it));
-      ++deque_it;
 #endif
+      ++deque_it;
     }

     if (deque_it == outstanding_events.end()) {
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
index f625a8063becd6..93da1fd664f018 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
@@ -20,11 +20,12 @@
 #endif

 #include <deque>
+#include <map>
 #include <memory>
 #include <mutex>
 #include <set>
-#include <unordered_map>
 #include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/memory/allocation/spin_lock.h"

 namespace paddle {
 namespace memory {
@@ -41,7 +42,7 @@ class StreamSafeCUDAAllocation : public Allocation {
   AllocationPtr underlying_allocation_;
   gpuStream_t owning_stream_;
   std::shared_ptr<std::set<gpuStream_t>> recorded_streams_;
-  std::mutex mutex_;
+  SpinLock spin_lock_;
 };

 class StreamSafeCUDAAllocator : public Allocator {
@@ -57,23 +58,16 @@ class StreamSafeCUDAAllocator : public Allocator {
   uint64_t ReleaseImpl(const platform::Place &place) override;

  private:
-  struct AllocationInfo {
-    std::deque<gpuEvent_t> outstanding_events;
-    bool can_be_freed{false};
-  };
-
   void CreateEventForAllRecordedStream(
       std::set<gpuStream_t> *recorded_streams,
       std::deque<gpuEvent_t> *outstanding_events);
   void FreeStreamSafeCUDAAllocation(Allocation *allocation);
-  std::shared_ptr<AllocationInfo> GetAllocationInfo(Allocation *);
   void ProcessEventsAndFree();

   std::shared_ptr<Allocator> underlying_allocator_;
   gpuStream_t default_stream_;
-  std::unordered_map<Allocation *, std::shared_ptr<AllocationInfo>>
-      allocation_info_map_;
-  mutable std::recursive_mutex mutex_;
+  std::map<Allocation *, std::deque<gpuEvent_t>> outstanding_events_map_;
+  SpinLock spin_lock_;
 };

 }  // namespace allocation

From 9b881d45d2bed00632b9f5a8885dce91c9324fd4 Mon Sep 17 00:00:00 2001
From: chenruibiao
Date: Mon, 22 Nov 2021 16:33:43 +0800
Subject: [PATCH 10/13] FLAGS priority changes: FLAGS_use_system_allocator >
 FLAGS_use_stream_safe_cuda_allocator

---
 paddle/fluid/memory/CMakeLists.txt            |  1 +
 .../memory/allocation/allocator_facade.cc     | 35 ++++++++++---------
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt
index 64fd04b5ea82ba..69134e1c76bb74 100644
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@@ -23,6 +23,7 @@ if (WITH_GPU)

   if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
     set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES
+                         ENVIRONMENT "FLAGS_use_system_allocator=false"
                          ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true"
                          ENVIRONMENT "FLAGS_allocator_strategy=auto_growth")
   endif()
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 8e3aa349e72bce..f9f5ee3d29be5f 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -239,14 +239,6 @@ class AllocatorFacadePrivate {
     VLOG(6) << "GetAllocator"
             << " " << place << " " << size;
-#ifdef PADDLE_WITH_CUDA
-    if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-        size > 0) {
-      return GetCUDAAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
-                              default_stream_);
-    }
-#endif
-
     const auto& allocators =
         (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                           : GetAllocatorMap())
@@ -259,7 +251,7 @@ class AllocatorFacadePrivate {
   }

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  const std::shared_ptr<Allocator>& GetCUDAAllocator(
+  const std::shared_ptr<Allocator>& GetAllocator(
       const platform::CUDAPlace& place, const gpuStream_t& stream) {
     auto place_it = cuda_allocators_.find(place);
     PADDLE_ENFORCE_NE(place_it, cuda_allocators_.end(),
@@ -292,18 +284,18 @@ class AllocatorFacadePrivate {
                          const gpuStream_t& stream, size_t size) {
     std::shared_ptr<Allocator> cuda_allocator;
     try {
-      cuda_allocator = GetCUDAAllocator(place, stream);
+      cuda_allocator = GetAllocator(place, stream);
     } catch (platform::EnforceNotMet& err) {
       VLOG(9) << "No allocator found for stream " << stream << " in place "
               << place << " , build a new one";
       std::unique_lock<std::mutex> lock(cuda_init_mutex_);
       try {
-        cuda_allocator = GetCUDAAllocator(place, stream);
+        cuda_allocator = GetAllocator(place, stream);
         VLOG(9) << "Other thread had build a new allocator for stream "
                 << stream << " in place " << place;
       } catch (platform::EnforceNotMet& err) {
         InitStreamSafeCUDAAllocator(place, stream);
-        cuda_allocator = GetCUDAAllocator(place, stream);
+        cuda_allocator = GetAllocator(place, stream);
       } catch (...) {
         throw;
       }
@@ -595,7 +587,7 @@ class AllocatorFacadePrivate {
   void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
     const std::shared_ptr<Allocator>& underlying_allocator =
-        GetCUDAAllocator(p, stream);
+        GetAllocator(p, stream);
     cuda_allocators_[p][stream] =
         std::make_shared<StreamSafeCUDAAllocator>(underlying_allocator, stream);
   }
@@ -749,7 +741,8 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                      size_t size) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place)) {
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      size > 0 && FLAGS_use_system_allocator == false) {
     return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place),
                  m_->GetDefaultStream(), size);
   }
@@ -759,7 +752,8 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
 uint64_t AllocatorFacade::Release(const platform::Place& place) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place)) {
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      FLAGS_use_system_allocator == false) {
     return Release(BOOST_GET_CONST(platform::CUDAPlace, place),
                    m_->GetDefaultStream());
   }
@@ -770,6 +764,13 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      FLAGS_use_system_allocator == false) {
+    return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
+                            m_->GetDefaultStream());
+  }
+#endif
   return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
 }

@@ -795,7 +796,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
           "multi-stream 'Alloc' function. "
" "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); - if (size > 0) { + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { return m_->CUDAAlloc(place, stream, size); } else { return m_->GetAllocator(place, size)->Allocate(size); @@ -811,7 +812,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, "multi-stream 'Release' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); - return m_->GetCUDAAllocator(place, stream)->Release(place); + return m_->GetAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(Allocation* allocation, From b78b055b821e67f64b869869c0529b44bc3231af Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Mon, 22 Nov 2021 17:54:26 +0800 Subject: [PATCH 11/13] Performance improvement: directly delete allocation when the recorded_streams is empty in FreeImpl of StreamSafeCUDAAllocator --- .../memory/allocation/allocator_facade.cc | 28 +++++++++---------- .../allocation/stream_safe_cuda_allocator.cc | 8 +++++- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index f9f5ee3d29be5f..4447dacd8baff6 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -68,10 +68,9 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, "Use VirtualMemoryAutoGrowthBestFitAllocator."); -// NOTE(Ruibiao): This FLAGS is just to be compatibled with the old -// single-stream -// CUDA allocator. It will be removed after StreamSafeCudaAllocator has been -// fully tested. +// NOTE(Ruibiao): This FLAGS is just to be compatibled with +// the old single-stream CUDA allocator. It will be removed +// after StreamSafeCudaAllocator has been fully tested. PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true, "Enable StreamSafeCUDAAllocator"); @@ -234,11 +233,10 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); } - const std::shared_ptr& GetAllocator(const platform::Place& place, - size_t size) { + inline const std::shared_ptr& GetAllocator( + const platform::Place& place, size_t size) { VLOG(6) << "GetAllocator" << " " << place << " " << size; - const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? 
system_allocators_ : GetAllocatorMap()) @@ -575,23 +573,23 @@ class AllocatorFacadePrivate { allocators_[p] = std::make_shared(p); } + void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { + const std::shared_ptr& underlying_allocator = + GetAllocator(p, stream); + cuda_allocators_[p][stream] = + std::make_shared(underlying_allocator, stream); + } + void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, size_t retry_time) { PADDLE_ENFORCE_GT( retry_time, 0, platform::errors::InvalidArgument( "Retry time should be larger than 0, but got %d", retry_time)); - std::shared_ptr& allocator = cuda_allocators_[p][stream]; + std::shared_ptr allocator = GetAllocator(p, stream); allocator = std::make_shared(allocator, retry_time); } - void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { - const std::shared_ptr& underlying_allocator = - GetAllocator(p, stream); - cuda_allocators_[p][stream] = - std::make_shared(underlying_allocator, stream); - } - static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { for (auto& place_pair : allocators) { for (auto& stream_pair : place_pair.second) { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 5996786995656d..26ad68468aa505 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -59,7 +59,13 @@ Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { std::lock_guard lock_guard(spin_lock_); - FreeStreamSafeCUDAAllocation(allocation); + if (dynamic_cast(allocation) + ->GetRecordedStreams() + ->empty()) { + delete allocation; + } else { + FreeStreamSafeCUDAAllocation(allocation); + } } uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { From 896a67a3485d9f05f64dcf4bda70554c426f576d Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Tue, 23 Nov 2021 14:26:38 +0800 Subject: [PATCH 12/13] Add UT for alloc interface --- .../memory/stream_safe_cuda_alloc_test.cu | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index a75ef88141d6ca..e63bc789064276 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -160,17 +160,35 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { std::vector> allocations_; }; -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStream) { +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { MultiStreamRun(); CheckResult(); } -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStream) { +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { MultiThreadMUltiStreamRun(); CheckResult(); } -TEST(StreamSafeCUDAAllocRetryTest, StreamSafeCUDAAllocRetry) { +TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { + platform::CUDAPlace place = platform::CUDAPlace(); + size_t alloc_size = 256; + + std::shared_ptr allocation_implicit_stream = + AllocShared(place, alloc_size); + EXPECT_GE(allocation_implicit_stream->size(), alloc_size); + + void *address = allocation_implicit_stream->ptr(); + allocation_implicit_stream.reset(); + + gpuStream_t default_stream = nullptr; + allocation::AllocationPtr allocation_unique = + Alloc(place, default_stream, alloc_size); + 
EXPECT_GE(allocation_unique->size(), alloc_size); + EXPECT_EQ(allocation_unique->ptr(), address); +} + +TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; #ifdef PADDLE_WITH_CUDA From 5a300a3ac34dc40aa75cf8a00bb4a1440b3305fc Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Tue, 23 Nov 2021 19:52:49 +0800 Subject: [PATCH 13/13] Changes multi-stream interface; move retry code from AllocatorFacadePrivate to StreamSafeCUDAAllocator --- .../memory/allocation/allocator_facade.cc | 120 +++++++----------- .../memory/allocation/allocator_facade.h | 12 +- .../allocation/stream_safe_cuda_allocator.cc | 58 +++++++-- .../allocation/stream_safe_cuda_allocator.h | 12 +- paddle/fluid/memory/malloc.cc | 14 +- paddle/fluid/memory/malloc.h | 8 +- .../memory/stream_safe_cuda_alloc_test.cu | 8 +- 7 files changed, 125 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 4447dacd8baff6..a2b4cdddd30555 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -250,7 +250,8 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const std::shared_ptr& GetAllocator( - const platform::CUDAPlace& place, const gpuStream_t& stream) { + const platform::CUDAPlace& place, const gpuStream_t& stream, + bool create_if_not_found = false) { auto place_it = cuda_allocators_.find(place); PADDLE_ENFORCE_NE(place_it, cuda_allocators_.end(), platform::errors::NotFound( @@ -259,11 +260,15 @@ class AllocatorFacadePrivate { const std::map>& allocator_map = place_it->second; auto stream_it = allocator_map.find(stream); - PADDLE_ENFORCE_NE( - stream_it, allocator_map.end(), - platform::errors::NotFound( + if (stream_it == allocator_map.end()) { + if (create_if_not_found) { + InitStreamSafeCUDAAllocator(place, stream); + return cuda_allocators_[place][stream]; + } else { + PADDLE_THROW(platform::errors::NotFound( "No allocator found for stream %s in place %s", stream, place)); - + } + } return stream_it->second; } @@ -278,49 +283,6 @@ class AllocatorFacadePrivate { dynamic_cast(allocation)->RecordStream(stream); } - AllocationPtr CUDAAlloc(const platform::CUDAPlace& place, - const gpuStream_t& stream, size_t size) { - std::shared_ptr cuda_allocator; - try { - cuda_allocator = GetAllocator(place, stream); - } catch (platform::EnforceNotMet& err) { - VLOG(9) << "No allocator found for stream " << stream << " in place " - << place << " , build a new one"; - std::unique_lock lock(cuda_init_mutex_); - try { - cuda_allocator = GetAllocator(place, stream); - VLOG(9) << "Other thread had build a new allocator for stream " - << stream << " in place " << place; - } catch (platform::EnforceNotMet& err) { - InitStreamSafeCUDAAllocator(place, stream); - cuda_allocator = GetAllocator(place, stream); - } catch (...) { - throw; - } - } catch (...) { - throw; - } - - try { - return cuda_allocator->Allocate(size); - } catch (BadAlloc&) { - VLOG(9) << "Allocation failed when allocating " << size - << " bytes for stream " << stream; - for (auto pair : cuda_allocators_[place]) { - pair.second->Release(place); - } - try { - return cuda_allocator->Allocate(size); - } catch (...) { - VLOG(9) << "Still allocation failed " - << "after release memory from all streams"; - throw; - } - } catch (...) 
{
-      throw;
-    }
-  }
-
-    try {
-      return cuda_allocator->Allocate(size);
-    } catch (BadAlloc&) {
-      VLOG(9) << "Allocation failed when allocating " << size
-              << " bytes for stream " << stream;
-      for (auto pair : cuda_allocators_[place]) {
-        pair.second->Release(place);
-      }
-      try {
-        return cuda_allocator->Allocate(size);
-      } catch (...) {
-        VLOG(9) << "Still allocation failed "
-                << "after release memory from all streams";
-        throw;
-      }
-    } catch (...) {
-      throw;
-    }
-  }
-
 #ifdef PADDLE_WITH_CUDA
   void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
     PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
@@ -404,15 +366,22 @@ class AllocatorFacadePrivate {
     PADDLE_ENFORCE_EQ(
         strategy_, AllocatorStrategy::kAutoGrowth,
         platform::errors::Unimplemented(
-            "Now only support auto-growth strategey for "
-            "StreamSafeCUDAAllocator, "
+            "Only support auto-growth strategy for StreamSafeCUDAAllocator, "
            "the allocator strategy %d is unsupported for multi-stream",
            static_cast<int>(strategy_)));
     VLOG(9) << "Init CUDA allocator for stream " << stream << " in place " << p;
-
-    InitAutoGrowthCUDAAllocator(p, stream);
-    WrapStreamSafeCUDAAllocator(p, stream);
-    WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
+    std::lock_guard<SpinLock> lock_guard(cuda_allocators_lock_);
+    try {
+      GetAllocator(p, stream);
+      VLOG(9) << "Other thread has built an allocator for stream " << stream
+              << " in place " << p;
+    } catch (platform::EnforceNotMet&) {
+      InitAutoGrowthCUDAAllocator(p, stream);
+      WrapStreamSafeCUDAAllocator(p, stream);
+      WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
+    } catch (...) {
+      throw;
+    }
   }

   void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
@@ -575,8 +545,8 @@ class AllocatorFacadePrivate {
   void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
     const std::shared_ptr<Allocator>& underlying_allocator =
         GetAllocator(p, stream);
-    cuda_allocators_[p][stream] =
-        std::make_shared<StreamSafeCUDAAllocator>(underlying_allocator, stream);
+    cuda_allocators_[p][stream] = std::make_shared<StreamSafeCUDAAllocator>(
+        underlying_allocator, p, stream);
   }

   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
@@ -704,7 +673,7 @@ class AllocatorFacadePrivate {
   // a standalone CUDA allocator to support multi-stream GC in new executor
   CUDAAllocatorMap cuda_allocators_;
   gpuStream_t default_stream_;
-  std::mutex cuda_init_mutex_;
+  SpinLock cuda_allocators_lock_;
 #ifdef PADDLE_WITH_CUDA
   std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
       cuda_graph_allocator_map_;
@@ -731,6 +700,18 @@ AllocatorFacade& AllocatorFacade::Instance() {
   return instance;
 }

+const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
+    const platform::Place& place) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      FLAGS_use_system_allocator == false) {
+    return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
+                            m_->GetDefaultStream());
+  }
+#endif
+  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+}
+
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size) {
   return std::shared_ptr<Allocation>(Alloc(place, size));
@@ -741,8 +722,8 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       size > 0 && FLAGS_use_system_allocator == false) {
-    return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place),
-                 m_->GetDefaultStream(), size);
+    return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size,
+                 m_->GetDefaultStream());
   }
 #endif
   return m_->GetAllocator(place, size)->Allocate(size);
@@ -760,21 +741,9 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
       ->Release(place);
 }

-const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
-    const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-
-      FLAGS_use_system_allocator == false) {
-    return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
-                            m_->GetDefaultStream());
-  }
-#endif
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
-}
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
-    const platform::CUDAPlace& place, const gpuStream_t& stream, size_t size) {
+    const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
          "StreamSafeCUDAAllocator is disabled, you should not call this "
          "multi-stream 'AllocShared' function. "
          "To enable it, you can enter 'export "
          "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
-  return std::shared_ptr<Allocation>(Alloc(place, stream, size));
+  return std::shared_ptr<Allocation>(Alloc(place, size, stream));
 }

 AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
-                                     const gpuStream_t& stream, size_t size) {
+                                     size_t size, const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
          "StreamSafeCUDAAllocator is disabled, you should not call this "
          "multi-stream 'Alloc' function. "
          "To enable it, you can enter 'export "
          "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
   if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
-    return m_->CUDAAlloc(place, stream, size);
+    return m_->GetAllocator(place, stream, /* create_if_not_found = */ true)
+        ->Allocate(size);
   } else {
     return m_->GetAllocator(place, size)->Allocate(size);
   }
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index 813cf62c1366c9..4cd8b4e91e614e 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -47,6 +47,8 @@ class AllocatorFacade {

   static AllocatorFacade& Instance();

+  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);
+
   // Allocate a shared allocation.
   std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                           size_t size);
@@ -55,14 +57,12 @@ class AllocatorFacade {
   // Release unused memory pool.
   uint64_t Release(const platform::Place& place);

-  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
-                                          const gpuStream_t& stream,
-                                          size_t size);
-  AllocationPtr Alloc(const platform::CUDAPlace& place,
-                      const gpuStream_t& stream, size_t size);
+  std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
+                                          size_t size,
+                                          const gpuStream_t& stream);
+  AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                      const gpuStream_t& stream);
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(Allocation* allocation, const gpuStream_t& stream);
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index 26ad68468aa505..b2e13af6ef956e 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
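// ---- Editor's sketch (illustrative, not part of the patch) ----
// The SpinLock type used throughout the allocator below comes from
// spin_lock.h (introduced in patch 09); its implementation is not shown in
// this series. A minimal equivalent that satisfies BasicLockable, and would
// therefore work with the std::lock_guard<SpinLock> guards in this file, is:
//
//   #include <atomic>
//   class SpinLock {
//    public:
//     void lock() {
//       while (flag_.test_and_set(std::memory_order_acquire)) {
//         // busy-wait; acceptable because the critical sections are short
//       }
//     }
//     void unlock() { flag_.clear(std::memory_order_release); }
//    private:
//     std::atomic_flag flag_ = ATOMIC_FLAG_INIT;
//   };
// ----------------------------------------------------------------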
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace memory { @@ -42,38 +43,67 @@ StreamSafeCUDAAllocation::GetRecordedStreams() { StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( const std::shared_ptr& underlying_allocator, - const gpuStream_t default_stream) + const platform::CUDAPlace& place, const gpuStream_t default_stream) : underlying_allocator_(underlying_allocator), - default_stream_(default_stream) {} + place_(place), + default_stream_(default_stream) { + std::lock_guard lock_guard(allocators_map_lock_); + allocators_map_[place].emplace_back(this); +} + +StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { + std::lock_guard lock_guard(allocators_map_lock_); + std::vector& allocators = allocators_map_[place_]; + allocators.erase(std::remove(allocators.begin(), allocators.end(), this), + allocators.end()); +} bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { - std::lock_guard lock_guard(spin_lock_); ProcessEventsAndFree(); - AllocationPtr underlying_allocation = underlying_allocator_->Allocate(size); + AllocationPtr underlying_allocation; + try { + underlying_allocation = underlying_allocator_->Allocate(size); + } catch (BadAlloc&) { + VLOG(9) << "Allocation failed when allocating " << size << " bytes"; + uint64_t release_size = ReleaseImpl(place_); + VLOG(9) << "Release " << release_size << " bytes memory from all streams"; + try { + underlying_allocation = underlying_allocator_->Allocate(size); + } catch (...) { + VLOG(9) << "Still allocation failed after release memory"; + throw; + } + } catch (...) { + throw; + } + StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( std::move(underlying_allocation), default_stream_); return allocation; } void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { - std::lock_guard lock_guard(spin_lock_); if (dynamic_cast(allocation) ->GetRecordedStreams() ->empty()) { delete allocation; } else { + std::lock_guard lock_guard(outstanding_events_map_lock_); FreeStreamSafeCUDAAllocation(allocation); } } uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { - /*lock_guard*/ { - std::lock_guard lock_guard(spin_lock_); - ProcessEventsAndFree(); + std::lock_guard lock_guard(allocators_map_lock_); + std::vector& allocators = + allocators_map_[BOOST_GET_CONST(platform::CUDAPlace, place)]; + uint64_t release_size = 0; + for (StreamSafeCUDAAllocator* allocator : allocators) { + release_size += allocator->ProcessEventsAndFreeWithRelease(); } - return underlying_allocator_->Release(place); + return release_size; } void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream( @@ -116,6 +146,7 @@ void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation( } void StreamSafeCUDAAllocator::ProcessEventsAndFree() { + std::lock_guard lock_guard(outstanding_events_map_lock_); for (auto map_it = outstanding_events_map_.begin(); map_it != outstanding_events_map_.end();) { std::deque& outstanding_events = map_it->second; @@ -161,6 +192,15 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { } } +uint64_t StreamSafeCUDAAllocator::ProcessEventsAndFreeWithRelease() { + ProcessEventsAndFree(); + return underlying_allocator_->Release(place_); +} + +std::map> + StreamSafeCUDAAllocator::allocators_map_; +SpinLock StreamSafeCUDAAllocator::allocators_map_lock_; + } // namespace allocation } // namespace memory } // namespace paddle diff --git 
a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 93da1fd664f018..a516558228be63 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -26,6 +26,7 @@ #include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/spin_lock.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { @@ -49,7 +50,8 @@ class StreamSafeCUDAAllocator : public Allocator { public: StreamSafeCUDAAllocator( const std::shared_ptr &underlying_allocator, - const gpuStream_t default_stream); + const platform::CUDAPlace &place, const gpuStream_t default_stream); + ~StreamSafeCUDAAllocator(); bool IsAllocThreadSafe() const override; protected: @@ -63,11 +65,17 @@ class StreamSafeCUDAAllocator : public Allocator { std::deque *outstanding_events); void FreeStreamSafeCUDAAllocation(Allocation *allocation); void ProcessEventsAndFree(); + uint64_t ProcessEventsAndFreeWithRelease(); + + static std::map> + allocators_map_; + static SpinLock allocators_map_lock_; std::shared_ptr underlying_allocator_; + platform::CUDAPlace place_; gpuStream_t default_stream_; std::map> outstanding_events_map_; - SpinLock spin_lock_; + SpinLock outstanding_events_map_lock_; }; } // namespace allocation diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 9cd391acca6a45..4921b87ccd99e9 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -35,15 +35,15 @@ uint64_t Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::shared_ptr AllocShared(const platform::CUDAPlace& place, - const gpuStream_t& stream, - size_t size) { - return allocation::AllocatorFacade::Instance().AllocShared(place, stream, - size); + size_t size, + const gpuStream_t& stream) { + return allocation::AllocatorFacade::Instance().AllocShared(place, size, + stream); } -AllocationPtr Alloc(const platform::CUDAPlace& place, const gpuStream_t& stream, - size_t size) { - return allocation::AllocatorFacade::Instance().Alloc(place, stream, size); +AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const gpuStream_t& stream) { + return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); } uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 4926dc572cad4c..2aa9fbe6ada8fe 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -42,11 +42,11 @@ extern uint64_t Release(const platform::Place& place); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) extern std::shared_ptr AllocShared(const platform::CUDAPlace& place, - const gpuStream_t& stream, - size_t size); + size_t size, + const gpuStream_t& stream); -extern AllocationPtr Alloc(const platform::CUDAPlace& place, - const gpuStream_t& stream, size_t size); +extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const gpuStream_t& stream); extern uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index e63bc789064276..6a5818fd9603be 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -63,7 +63,7 @@ class 
StreamSafeCUDAAllocTest : public ::testing::Test { for (size_t i = 0; i < stream_num_; ++i) { size_t allocation_size = data_num_ * sizeof(int); std::shared_ptr allocation = - AllocShared(place_, streams_[i], allocation_size); + AllocShared(place_, allocation_size, streams_[i]); #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemset(allocation->ptr(), 0, allocation->size())); @@ -183,7 +183,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { gpuStream_t default_stream = nullptr; allocation::AllocationPtr allocation_unique = - Alloc(place, default_stream, alloc_size); + Alloc(place, alloc_size, default_stream); EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_EQ(allocation_unique->ptr(), address); } @@ -203,12 +203,12 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { size_t alloc_size = available_size / 4 * 3; std::shared_ptr allocation1 = - AllocShared(place, stream1, alloc_size); + AllocShared(place, alloc_size, stream1); std::shared_ptr allocation2; std::thread th([&allocation2, &place, &stream2, alloc_size]() { std::this_thread::sleep_for(std::chrono::seconds(1)); - allocation2 = AllocShared(place, stream2, alloc_size); + allocation2 = AllocShared(place, alloc_size, stream2); }); allocation1.reset(); // free but not release th.join();
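// ---- Editor's note (illustrative, not part of the patch) ----
// Why the retry test above passes deterministically: alloc_size is set to
// available_size * 3 / 4, so alloc_size < available_size < 2 * alloc_size
// and the two allocations can never coexist (e.g. with 8 GiB free, each
// request is 6 GiB). After allocation1.reset() the chunk is freed but still
// cached in stream1's auto_growth pool, so the second thread's AllocShared
// on stream2 first hits BadAlloc; the allocator then releases idle memory
// from the other stream's pool and, failing that, the retry allocator keeps
// re-attempting for up to FLAGS_gpu_allocator_retry_time ms until the
// request can be satisfied.
// ----------------------------------------------------------------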