From 515be94e8298f0db907a4c3d5788ba3da724dc32 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Wed, 17 Nov 2021 15:37:28 +0800 Subject: [PATCH 01/13] Support multi-stream allocation for CUDA place --- paddle/fluid/memory/allocation/CMakeLists.txt | 3 +- paddle/fluid/memory/allocation/allocator.h | 9 + .../memory/allocation/allocator_facade.cc | 558 ++++++++++++------ .../memory/allocation/allocator_facade.h | 18 +- .../memory/allocation/retry_allocator.cc | 19 - .../fluid/memory/allocation/retry_allocator.h | 19 + .../allocation/stream_safe_cuda_allocator.cc | 160 +++++ .../allocation/stream_safe_cuda_allocator.h | 76 +++ paddle/fluid/memory/malloc.cc | 28 +- paddle/fluid/memory/malloc.h | 13 + paddle/fluid/memory/malloc_test.cu | 144 +++++ 11 files changed, 838 insertions(+), 209 deletions(-) create mode 100644 paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc create mode 100644 paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 58979d6c3e1852..4b4717cead36e5 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -88,7 +88,7 @@ endif() cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stream_safe_cuda_allocator) if (WITH_GPU) target_link_libraries(allocator_facade cuda_graph) @@ -113,6 +113,7 @@ cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator) cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator) +cc_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator) if(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index b11c657b96b74c..de108434154395 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -39,6 +39,7 @@ struct BadAlloc : public std::exception { }; class Allocator; +extern void NotifyGPURetryThreads(); // Allocation is the object holding the actually pointer. Use // `Allocation::ptr()` will returns the pointer that allocated. 
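// ---------------------------------------------------------------------------
// A minimal usage sketch of the stream-aware interface this patch adds
// (illustrative only, not part of the diff; it assumes device 0 is visible
// and that <cuda_runtime.h> and paddle/fluid/memory/malloc.h are included):
//
//   paddle::platform::CUDAPlace place(0);
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//
//   // Allocate 1 KB whose lifetime is tracked against `stream`.
//   auto buf = paddle::memory::AllocShared(place, stream, 1024);
//
//   // If another stream also consumes `buf`, record it so that the actual
//   // free is deferred until that stream's queued work finishes:
//   //   paddle::memory::RecordStream(buf.get(), other_stream);
//
//   buf.reset();                             // "free" (possibly deferred)
//   paddle::memory::Release(place, stream);  // return idle memory to the GPU
// ---------------------------------------------------------------------------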
@@ -155,8 +156,16 @@ class Allocator { class AllocationDeleter { public: inline void operator()(Allocation* allocation) const { + VLOG(10) << "Run AllocationDeleter"; Allocator* allocator = allocation->TopDecoratedAllocator(); allocator->Free(allocation); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(Ruibiao): Dirty code, consider a better design to notify CUDA + // alloc retry + if (platform::is_gpu_place(allocation->place())) { + NotifyGPURetryThreads(); + } +#endif } }; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9da735636fc00f..d5bab43d60c442 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -14,19 +14,29 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" +#include +#include +#include + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include +#endif #include "gflags/gflags.h" + +#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif -#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" +#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h" @@ -44,7 +54,6 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu/xpu_info.h" #endif -#include "paddle/fluid/platform/npu_info.h" PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, @@ -114,83 +123,249 @@ class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + using CUDAAllocatorMap = + std::map>>; +#endif + explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { strategy_ = GetAllocatorStrategy(); - switch (strategy_) { - case AllocatorStrategy::kNaiveBestFit: { - InitNaiveBestFitCPUAllocator(); -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); - } -#endif + CheckStrategy(strategy_); + + InitNaiveBestFitCPUAllocator(); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { - InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); - } - InitNaiveBestFitCUDAPinnedAllocator(); -#endif -#ifdef PADDLE_WITH_ASCEND_CL - for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { - InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); - } - InitNaiveBestFitNPUPinnedAllocator(); + allow_free_idle_chunk_ = allow_free_idle_chunk; + default_cuda_stream_ = nullptr; + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) 
{ + InitCUDAAllocator(platform::CUDAPlace(dev_id), default_cuda_stream_); + } + InitNaiveBestFitCUDAPinnedAllocator(); #endif - break; - } - case AllocatorStrategy::kAutoGrowth: { - InitNaiveBestFitCPUAllocator(); #ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); - } -#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { - InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), - allow_free_idle_chunk); - } - InitNaiveBestFitCUDAPinnedAllocator(); + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } #endif - break; - } - case AllocatorStrategy::kThreadLocal: { - InitNaiveBestFitCPUAllocator(); -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); - } +#ifdef PADDLE_WITH_ASCEND_CL + if (strategy_ == AllocatorStrategy::kNaiveBestFit) { + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } + InitNaiveBestFitNPUPinnedAllocator(); + } #endif + + InitZeroSizeAllocators(); + InitSystemAllocators(); + CheckAllocThreadSafe(); + } + + const std::shared_ptr& GetAllocator(const platform::Place& place, + size_t size) { + VLOG(6) << "GetAllocator" + << " " << place << " " << size; + + if (platform::is_gpu_place(place) && size > 0) { + return GetCUDAAllocator(boost::get(place), + default_cuda_stream_); + } + + const auto& allocators = + (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_ + : GetAllocatorMap()) + : zero_size_allocators_); + auto iter = allocators.find(place); + PADDLE_ENFORCE_NE(iter, allocators.end(), + platform::errors::NotFound( + "No allocator found for the place, %s", place)); + return iter->second; + } + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { - InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); - } - InitNaiveBestFitCUDAPinnedAllocator(); -#endif - break; + const std::shared_ptr& GetCUDAAllocator( + const platform::CUDAPlace& place, const cudaStream_t& stream) { + auto place_it = cuda_allocators_.find(place); + PADDLE_ENFORCE_NE(place_it, cuda_allocators_.end(), + platform::errors::NotFound( + "No allocator found for the place %s", place)); + + const std::map>& allocator_map = + place_it->second; + auto stream_it = allocator_map.find(stream); + PADDLE_ENFORCE_NE( + stream_it, allocator_map.end(), + platform::errors::NotFound( + "No allocator found for stream %s in place %s", stream, place)); + + return stream_it->second; + } + + cudaStream_t GetDefaultCudaStream() { return default_cuda_stream_; } + + void NotifyGPURetryThreads() { cuda_retry_cv_.notify_all(); } + + void RecordStream(Allocation* allocation, const cudaStream_t& stream) { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(allocation->place()), true, + platform::errors::InvalidArgument( + "Not allow to record stream for an allocation with place %s", + allocation->place())); + dynamic_cast(allocation)->RecordStream(stream); + } + + AllocationPtr CUDAAlloc(const platform::CUDAPlace& place, + const cudaStream_t& stream, size_t size) { + std::shared_ptr cuda_allocator; + /* NOTE(Ruibiao): This code does not lead to lock competition + * for 
searching initialized CUDA allocator in multithreaded scenario.
+   * However, when the corresponding CUDA allocator is not initialized,
+   * it may result in large lookup overhead,
+   * which calls GetCUDAAllocator 3 times in the worst case.
+   **/
+    try {
+      cuda_allocator = GetCUDAAllocator(place, stream);
+    } catch (platform::EnforceNotMet& err) {
+      VLOG(9) << "No allocator found for stream " << stream << "in place "
+              << place << " , build a new one";
+      std::unique_lock lock(cuda_retry_mutex_);
+      try {
+        cuda_allocator = GetCUDAAllocator(place, stream);
+      } catch (platform::EnforceNotMet& err) {
+        InitCUDAAllocator(place, stream);
+        cuda_allocator = GetCUDAAllocator(place, stream);
+      } catch (...) {
+        throw;
+      }
+    } catch (...) {
+      throw;
+    }
-      default: {
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Unsupported allocator strategy: %d", static_cast(strategy_)));
+    if (FLAGS_gpu_allocator_retry_time <= 0) {
+      return cuda_allocator->Allocate(size);
+    }
+
+    // In fact, we can unify the code of allocation success and failure
+    // But it would add lock even when allocation success at the first time
+    try {
+      return cuda_allocator->Allocate(size);
+    } catch (BadAlloc&) {
+      VLOG(9) << "Allocation failed when allocating " << size
+              << " bytes for stream " << stream;
+      for (auto pair : cuda_allocators_[place]) {
+        std::shared_ptr cuda_allocator = pair.second;
+        std::dynamic_pointer_cast(cuda_allocator)
+            ->ProcessEventsAndFree();
      }
+      try {
+        return cuda_allocator->Allocate(size);
+      } catch (BadAlloc&) {
+        {
+          WaitedAllocateSizeGuard guard(&cuda_waited_allocate_size_, size);
+          VLOG(10)
+              << "Still allocation failed after calling ProcessEventAndFree, "
+              << " cuda_waited_allocate_size_ = " << cuda_waited_allocate_size_;
+          // We can just write allocation retry inside the predicate function of
+          // wait_until. But it needs to acquire the lock when executing
+          // predicate
+          // function. For better performance, we use loop here
+          auto end_time =
+              std::chrono::high_resolution_clock::now() +
+              std::chrono::milliseconds(FLAGS_gpu_allocator_retry_time);
+          auto wait_until = [&end_time, this] {
+            std::unique_lock lock(cuda_retry_mutex_);
+            return cuda_retry_cv_.wait_until(lock, end_time);
+          };
+
+          size_t retry_times = 0;
+          while (wait_until() != std::cv_status::timeout) {
+            try {
+              return cuda_allocator->Allocate(size);
+            } catch (BadAlloc&) {
+              ++retry_times;
+              VLOG(10) << "Allocation failed when retrying " << retry_times
+                       << " times when allocating " << size
+                       << " bytes. Wait still.";
+            } catch (...) {
+              throw;
+            }
+          }
+        }
+        VLOG(10) << "Allocation failed because of timeout when allocating "
+                 << size << " bytes.";
+        return cuda_allocator->Allocate(
+            size);  // If timeout, try last allocation request
+      } catch (...) {
+        throw;
+      }
+    } catch (...)
{ + throw; } - InitZeroSizeAllocators(); - InitSystemAllocators(); + } - if (FLAGS_gpu_allocator_retry_time > 0) { - WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time); +#ifdef PADDLE_WITH_CUDA + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_allocator_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset( + new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + for (auto& item : allocator->allocators_) { + auto& old_allocator = item.second; + old_allocator = CUDAGraphAllocator::Create(old_allocator); } + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; + } - CheckAllocThreadSafe(); + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + auto iter = cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_allocator_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; } +#endif +#endif + + private: + class ZeroSizeAllocator : public Allocator { + public: + explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} + bool IsAllocThreadSafe() const override { return true; } + + protected: + Allocation* AllocateImpl(size_t size) override { + return new Allocation(nullptr, 0, place_); + } + void FreeImpl(Allocation* allocation) override { delete allocation; } - inline const AllocatorMap& GetAllocatorMap() { + private: + platform::Place place_; + }; + + void CheckStrategy(AllocatorStrategy strategy) { + if (strategy != AllocatorStrategy::kNaiveBestFit && + strategy != AllocatorStrategy::kAutoGrowth && + strategy != AllocatorStrategy::kThreadLocal) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported allocator strategy: %d", static_cast(strategy_))); + } + } + + const AllocatorMap& GetAllocatorMap() { #ifdef PADDLE_WITH_CUDA if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { auto id = platform::CUDAGraph::CapturingID(); @@ -208,43 +383,6 @@ class AllocatorFacadePrivate { #endif } - inline const std::shared_ptr& GetAllocator( - const platform::Place& place, size_t size) { - VLOG(6) << "GetAllocator" - << " " << place << " " << size; - const auto& allocators = - (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? 
system_allocators_ - : GetAllocatorMap()) - : zero_size_allocators_); - auto iter = allocators.find(place); - PADDLE_ENFORCE_NE(iter, allocators.end(), - platform::errors::NotFound( - "No allocator found for the place, %s", place)); - return iter->second; - } - - private: - void InitSystemAllocators() { - if (!system_allocators_.empty()) return; - system_allocators_[platform::CPUPlace()] = std::make_shared(); -#ifdef PADDLE_WITH_XPU - int device_count = platform::GetXPUDeviceCount(); - for (int i = 0; i < device_count; ++i) { - platform::XPUPlace p(i); - system_allocators_[p] = std::make_shared(p); - } -#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - system_allocators_[platform::CUDAPinnedPlace()] = - std::make_shared(); - int device_count = platform::GetCUDADeviceCount(); - for (int i = 0; i < device_count; ++i) { - platform::CUDAPlace p(i); - system_allocators_[p] = std::make_shared(p); - } -#endif - } - void InitNaiveBestFitCPUAllocator() { allocators_[platform::CPUPlace()] = std::make_shared(platform::CPUPlace()); @@ -256,20 +394,38 @@ class AllocatorFacadePrivate { std::make_shared(platform::CUDAPinnedPlace()); } - void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) { - allocators_[p] = std::make_shared(p); + void InitCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { + switch (strategy_) { + case AllocatorStrategy::kNaiveBestFit: { + InitNaiveBestFitCUDAAllocator(p, stream); + break; + } + case AllocatorStrategy::kAutoGrowth: { + InitAutoGrowthCUDAAllocator(p, stream); + break; + } + case AllocatorStrategy::kThreadLocal: { + InitThreadLocalCUDAAllocator(p, stream); + break; + } + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported allocator strategy: %d", static_cast(strategy_))); + } + } + WrapStreamSafeCUDAAllocator(p, stream); } - void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) { - allocators_[p] = std::make_shared(p); + void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p, + cudaStream_t stream) { + cuda_allocators_[p][stream] = std::make_shared(p); } - void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, - bool allow_free_idle_chunk) { + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { #if defined(PADDLE_WITH_HIP) auto cuda_allocator = std::make_shared(p); - allocators_[p] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); + cuda_allocators_[p][stream] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk_); #endif #if defined(PADDLE_WITH_CUDA) @@ -290,15 +446,16 @@ class AllocatorFacadePrivate { if (val > 0 && FLAGS_use_virtual_memory_auto_growth) { auto cuda_allocator = std::make_shared(p); - allocators_[p] = + cuda_allocators_[p][stream] = std::make_shared( cuda_allocator, platform::GpuMinChunkSize(), p); } else { auto cuda_allocator = std::make_shared(p); - allocators_[p] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); + cuda_allocators_[p][stream] = + std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), + allow_free_idle_chunk_); } - #else auto cuda_allocator = std::make_shared(p); auto alignment = platform::GpuMinChunkSize(); @@ -332,11 +489,34 @@ class AllocatorFacadePrivate { VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; underlying_allocator = cuda_allocator; } - allocators_[p] = std::make_shared( - underlying_allocator, alignment, 0, allow_free_idle_chunk); + + cuda_allocators_[p][stream] = std::make_shared( + 
underlying_allocator, alignment, 0, allow_free_idle_chunk_); #endif #endif } + + void InitThreadLocalCUDAAllocator(platform::CUDAPlace p, + cudaStream_t stream) { + cuda_allocators_[p][stream] = std::make_shared(p); + } + + void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { + const std::shared_ptr& underlying_allocator = + GetCUDAAllocator(p, stream); + cuda_allocators_[p][stream] = + std::make_shared(underlying_allocator, stream); + } + + static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { + for (auto& place_pair : allocators) { + for (auto& stream_pair : place_pair.second) { + PADDLE_ENFORCE_EQ(stream_pair.second->IsAllocThreadSafe(), true, + platform::errors::InvalidArgument( + "Public allocators must be thread safe")); + } + } + } #endif #ifdef PADDLE_WITH_XPU @@ -354,26 +534,8 @@ class AllocatorFacadePrivate { allocators_[platform::NPUPinnedPlace()] = std::make_shared(); } - #endif - class ZeroSizeAllocator : public Allocator { - public: - explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} - - bool IsAllocThreadSafe() const override { return true; } - - protected: - Allocation* AllocateImpl(size_t size) override { - return new Allocation(nullptr, 0, place_); - } - - void FreeImpl(Allocation* allocation) override { delete allocation; } - - private: - platform::Place place_; - }; - void InitZeroSizeAllocators() { if (!zero_size_allocators_.empty()) return; std::vector places; @@ -411,74 +573,61 @@ class AllocatorFacadePrivate { } } - void CheckAllocThreadSafe() const { - CheckAllocThreadSafe(allocators_); - CheckAllocThreadSafe(zero_size_allocators_); - CheckAllocThreadSafe(system_allocators_); - } - - void WrapCUDARetryAllocator(size_t retry_time) { - PADDLE_ENFORCE_GT( - retry_time, 0, - platform::errors::InvalidArgument( - "Retry time should be larger than 0, but got %d", retry_time)); - for (auto& pair : allocators_) { - if (platform::is_gpu_place(pair.first)) { - pair.second = std::make_shared(pair.second, retry_time); - } + void InitSystemAllocators() { + if (!system_allocators_.empty()) return; + system_allocators_[platform::CPUPlace()] = std::make_shared(); +#ifdef PADDLE_WITH_XPU + int device_count = platform::GetXPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::XPUPlace p(i); + system_allocators_[p] = std::make_shared(p); } - } - -#ifdef PADDLE_WITH_CUDA - - public: - void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, - platform::errors::InvalidArgument( - "CUDA Graph is only supported when the " - "FLAGS_allocator_strategy=\"auto_growth\", but got " - "FLAGS_allocator_strategy=\"%s\"", - FLAGS_allocator_strategy)); - auto& allocator = cuda_graph_allocator_map_[id]; - PADDLE_ENFORCE_EQ( - allocator.get(), nullptr, - platform::errors::InvalidArgument( - "The memory pool of the CUDA Graph with ID %d have been prepared.", - id)); - allocator.reset( - new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); - for (auto& item : allocator->allocators_) { - auto& old_allocator = item.second; - old_allocator = CUDAGraphAllocator::Create(old_allocator); +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + system_allocators_[platform::CUDAPinnedPlace()] = + std::make_shared(); + int device_count = platform::GetCUDADeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::CUDAPlace p(i); + system_allocators_[p] = std::make_shared(p); } - VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; 
+#endif } - void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), - platform::errors::InvalidArgument( - "Cannot find CUDA Graph with ID = %d", id)); - cuda_graph_allocator_map_.erase(iter); - VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; - } + void CheckAllocThreadSafe() const { + CheckAllocThreadSafe(allocators_); + CheckAllocThreadSafe(zero_size_allocators_); + CheckAllocThreadSafe(system_allocators_); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + CheckCUDAAllocThreadSafe(cuda_allocators_); #endif + } - private: - AllocatorMap allocators_; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // a standalone CUDA allocator to support multi-stream GC in new executor + CUDAAllocatorMap cuda_allocators_; + cudaStream_t default_cuda_stream_; + static std::condition_variable cuda_retry_cv_; + std::mutex cuda_retry_mutex_; + std::mutex cuda_init_mutex_; + std::atomic cuda_waited_allocate_size_{0}; #ifdef PADDLE_WITH_CUDA std::unordered_map> cuda_graph_allocator_map_; +#endif #endif AllocatorStrategy strategy_; - + AllocatorMap allocators_; static AllocatorMap zero_size_allocators_; static AllocatorMap system_allocators_; + bool allow_free_idle_chunk_; }; - AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::zero_size_allocators_; AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +std::condition_variable AllocatorFacadePrivate::cuda_retry_cv_; +#endif // Pimpl. Make interface clean. AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} @@ -498,10 +647,22 @@ std::shared_ptr AllocatorFacade::AllocShared( AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (platform::is_gpu_place(place)) { + return Alloc(boost::get(place), + m_->GetDefaultCudaStream(), size); + } +#endif return m_->GetAllocator(place, size)->Allocate(size); } uint64_t AllocatorFacade::Release(const platform::Place& place) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (platform::is_gpu_place(place)) { + return Release(boost::get(place), + m_->GetDefaultCudaStream()); + } +#endif return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) ->Release(place); } @@ -511,6 +672,33 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +std::shared_ptr AllocatorFacade::AllocShared( + const platform::CUDAPlace& place, const cudaStream_t& stream, size_t size) { + return std::shared_ptr(Alloc(place, stream, size)); +} + +AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, + const cudaStream_t& stream, size_t size) { + if (size > 0) { + return m_->CUDAAlloc(place, stream, size); + } else { + return m_->GetAllocator(place, size)->Allocate(size); + } +} + +uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, + const cudaStream_t& stream) { + return m_->GetCUDAAllocator(place, stream)->Release(place); +} + +void AllocatorFacade::NotifyGPURetryThreads() { m_->NotifyGPURetryThreads(); } + +void AllocatorFacade::RecordStream(Allocation* allocation, + const cudaStream_t& stream) { + m_->RecordStream(allocation, stream); +} + #ifdef PADDLE_WITH_CUDA void 
AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { return m_->PrepareMemoryPoolForCUDAGraph(id); @@ -521,6 +709,10 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { } #endif +void NotifyGPURetryThreads() { + allocation::AllocatorFacade::Instance().NotifyGPURetryThreads(); +} +#endif } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 8d889ec38eed7e..507b36abd5404c 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -26,6 +26,7 @@ namespace paddle { namespace memory { namespace allocation { + #ifdef PADDLE_WITH_ASCEND_CL using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; #endif @@ -40,26 +41,37 @@ using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; class AllocatorFacadePrivate; class AllocatorFacade { public: - ~AllocatorFacade(); AllocatorFacade(const AllocatorFacade& o) = delete; const AllocatorFacade& operator=(const AllocatorFacade& o) = delete; + ~AllocatorFacade(); static AllocatorFacade& Instance(); // Allocate a shared allocation. std::shared_ptr AllocShared(const platform::Place& place, size_t size); - // Allocate a unique allocation. AllocationPtr Alloc(const platform::Place& place, size_t size); - // Release unused memory pool. uint64_t Release(const platform::Place& place); + const std::shared_ptr& GetAllocator(const platform::Place& place); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + std::shared_ptr AllocShared(const platform::CUDAPlace& place, + const cudaStream_t& stream, + size_t size); + AllocationPtr Alloc(const platform::CUDAPlace& place, + const cudaStream_t& stream, size_t size); + uint64_t Release(const platform::CUDAPlace& place, + const cudaStream_t& stream); + + void NotifyGPURetryThreads(); + void RecordStream(Allocation* allocation, const cudaStream_t& stream); #ifdef PADDLE_WITH_CUDA void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id); +#endif #endif // TODO(yy): Allocate a Copy-On-Write allocation? diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 1607af3808b434..86c7ad3dc5be16 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -20,25 +20,6 @@ namespace paddle { namespace memory { namespace allocation { -class WaitedAllocateSizeGuard { - public: - WaitedAllocateSizeGuard(std::atomic* waited_size, - size_t requested_size) - : waited_size_(waited_size), requested_size_(requested_size) { - waited_size_->fetch_add(requested_size_, - std::memory_order::memory_order_relaxed); - } - - ~WaitedAllocateSizeGuard() { - waited_size_->fetch_sub(requested_size_, - std::memory_order::memory_order_relaxed); - } - - private: - std::atomic* waited_size_; - size_t requested_size_; -}; - void RetryAllocator::FreeImpl(Allocation* allocation) { // Delete underlying allocation first. 
size_t size = allocation->size(); diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 031a5e2b97f178..36ff2e027c1a37 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -28,6 +28,25 @@ namespace paddle { namespace memory { namespace allocation { +class WaitedAllocateSizeGuard { + public: + WaitedAllocateSizeGuard(std::atomic* waited_size, + size_t requested_size) + : waited_size_(waited_size), requested_size_(requested_size) { + waited_size_->fetch_add(requested_size_, + std::memory_order::memory_order_relaxed); + } + + ~WaitedAllocateSizeGuard() { + waited_size_->fetch_sub(requested_size_, + std::memory_order::memory_order_relaxed); + } + + private: + std::atomic* waited_size_; + size_t requested_size_; +}; + class RetryAllocator : public Allocator { public: RetryAllocator(std::shared_ptr allocator, size_t retry_ms) diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc new file mode 100644 index 00000000000000..5558ddcbbb5361 --- /dev/null +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
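// ---------------------------------------------------------------------------
// Sketch of the core idea implemented in this file (illustrative standalone
// code, not part of the patch; raw CUDA runtime API instead of Paddle types):
// when an allocation is "freed", record one event per stream that used it and
// reclaim the memory only after every event has completed.
//
//   #include <cuda_runtime.h>
//   #include <vector>
//
//   struct DeferredFree {
//     void* ptr;
//     std::vector<cudaEvent_t> events;  // one per recorded stream
//   };
//
//   void MarkFreed(DeferredFree* d, const std::vector<cudaStream_t>& streams) {
//     for (cudaStream_t s : streams) {
//       cudaEvent_t e;
//       cudaEventCreateWithFlags(&e, cudaEventDisableTiming);
//       cudaEventRecord(e, s);  // fires once work queued so far on s is done
//       d->events.push_back(e);
//     }
//   }
//
//   // Frees and returns true only if no stream still has pending work.
//   bool TryReclaim(DeferredFree* d) {
//     for (cudaEvent_t e : d->events) {
//       if (cudaEventQuery(e) == cudaErrorNotReady) return false;
//     }
//     for (cudaEvent_t e : d->events) cudaEventDestroy(e);
//     cudaFree(d->ptr);
//     d->events.clear();
//     return true;
//   }
// ---------------------------------------------------------------------------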
+ +#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( + AllocationPtr underlying_allocation, cudaStream_t owning_stream) + : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)), + owning_stream_(owning_stream), + recorded_streams_(std::make_shared>()) {} + +void StreamSafeCUDAAllocation::RecordStream(cudaStream_t stream) { + VLOG(8) << "Record stream " << stream << " to " << ptr(); + if (stream == owning_stream_) { + return; + } + std::lock_guard lock(mutex_); + recorded_streams_->insert(stream); +} + +std::shared_ptr> +StreamSafeCUDAAllocation::GetRecordedStreams() { + return recorded_streams_; +} + +StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( + const std::shared_ptr& underlying_allocator, + const cudaStream_t default_stream) + : underlying_allocator_(underlying_allocator), + default_stream_(default_stream) {} + +bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } + +void StreamSafeCUDAAllocator::ProcessEventsAndFree() { + for (auto map_it = allocation_info_map_.begin(); + map_it != allocation_info_map_.end();) { + std::deque& outstanding_events = + map_it->second->outstanding_events; + VLOG(10) << "Check " << outstanding_events.size() + << " outstanding events for " << map_it->first->ptr(); + auto deque_it = outstanding_events.begin(); + while (deque_it != outstanding_events.end()) { + cudaError_t err = cudaEventQuery(*deque_it); + if (err == cudaErrorNotReady) { + VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() + << " is not complete"; + outstanding_events.erase(outstanding_events.begin(), deque_it); + break; + } + PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it)); + ++deque_it; + } + + if (deque_it == outstanding_events.end()) { + outstanding_events.clear(); + Allocation* allocation = map_it->first; + auto next_it = ++map_it; // "map_it" may be invalid after calling + // FreeStreamSafeCUDAAllocation + FreeStreamSafeCUDAAllocation(allocation); + map_it = next_it; + } else { + ++map_it; + } + } +} + +Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { + std::lock_guard lock(mutex_); + ProcessEventsAndFree(); + AllocationPtr underlying_allocation = underlying_allocator_->Allocate(size); + StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( + std::move(underlying_allocation), default_stream_); + allocation_info_map_[allocation] = std::make_shared(); + return allocation; +} + +void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { + std::lock_guard lock(mutex_); + GetAllocationInfo(allocation)->can_be_freed = true; + FreeStreamSafeCUDAAllocation(allocation); +} + +uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { + /*lock_guard*/ { + std::lock_guard lock(mutex_); + ProcessEventsAndFree(); + } + return underlying_allocator_->Release(place); +} + +void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream( + std::set* recorded_streams, + std::deque* outstanding_events) { + for (cudaStream_t stream : *recorded_streams) { + cudaEvent_t event; + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); + 
outstanding_events->emplace_back(event); + VLOG(9) << "Record event " << event << " in stream " << stream; + } + recorded_streams->clear(); +} + +void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation( + Allocation* allocation) { + std::shared_ptr allocation_info = + GetAllocationInfo(allocation); + if (!allocation_info->can_be_freed) { + return; + } + + std::deque& outstanding_events = + allocation_info->outstanding_events; + CreateEventForAllRecordedStream( + dynamic_cast(allocation) + ->GetRecordedStreams() + .get(), + &outstanding_events); + if (!outstanding_events.empty()) { + VLOG(8) << allocation->ptr() << " is not ready to free"; + return; + } + + VLOG(8) << "Free " << allocation->ptr(); + allocation_info_map_.erase(allocation); + delete allocation; +} + +std::shared_ptr +StreamSafeCUDAAllocator::GetAllocationInfo(Allocation* allocation) { + auto it = allocation_info_map_.find(allocation); + PADDLE_ENFORCE_NE( + it, allocation_info_map_.end(), + "The recorded allocation is not malloced by this allocator."); + return it->second; +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h new file mode 100644 index 00000000000000..1746f537f374cb --- /dev/null +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -0,0 +1,76 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
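// ---------------------------------------------------------------------------
// The hazard RecordStream() guards against, in an illustrative fragment (not
// part of the patch). Without it, resetting the shared_ptr below could return
// the memory to the pool while stream_b's kernel is still reading it, letting
// a later allocation race with that kernel:
//
//   auto buf = AllocShared(place, stream_a, size);       // owned by stream_a
//   kernel<<<grid, block, 0, stream_b>>>(
//       static_cast<float*>(buf->ptr()));                // used by stream_b
//   RecordStream(buf.get(), stream_b);  // tell the allocator about stream_b
//   buf.reset();  // safe: the real free waits for stream_b's work to finish
// ---------------------------------------------------------------------------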
+ +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class StreamSafeCUDAAllocation : public Allocation { + public: + StreamSafeCUDAAllocation(AllocationPtr underlying_allocation, + cudaStream_t owning_stream); + void RecordStream(cudaStream_t stream); + std::shared_ptr> GetRecordedStreams(); + + private: + AllocationPtr underlying_allocation_; + cudaStream_t owning_stream_; + std::shared_ptr> recorded_streams_; + std::mutex mutex_; +}; + +class StreamSafeCUDAAllocator : public Allocator { + public: + StreamSafeCUDAAllocator( + const std::shared_ptr &underlying_allocator, + const cudaStream_t default_stream); + bool IsAllocThreadSafe() const override; + void ProcessEventsAndFree(); + + protected: + Allocation *AllocateImpl(size_t size) override; + void FreeImpl(Allocation *allocation) override; + uint64_t ReleaseImpl(const platform::Place &place) override; + + private: + struct AllocationInfo { + std::deque outstanding_events; + bool can_be_freed{false}; + }; + + void CreateEventForAllRecordedStream( + std::set *recorded_streams, + std::deque *outstanding_events); + void FreeStreamSafeCUDAAllocation(Allocation *allocation); + std::shared_ptr GetAllocationInfo(Allocation *); + + std::shared_ptr underlying_allocator_; + cudaStream_t default_stream_; + std::unordered_map> + allocation_info_map_; + mutable std::recursive_mutex mutex_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 078e841068ca57..55b4657df3ac62 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -20,18 +20,40 @@ limitations under the License. 
*/ namespace paddle { namespace memory { -std::shared_ptr AllocShared(const platform::Place &place, +std::shared_ptr AllocShared(const platform::Place& place, size_t size) { return allocation::AllocatorFacade::Instance().AllocShared(place, size); } -AllocationPtr Alloc(const platform::Place &place, size_t size) { +AllocationPtr Alloc(const platform::Place& place, size_t size) { return allocation::AllocatorFacade::Instance().Alloc(place, size); } -uint64_t Release(const platform::Place &place) { +uint64_t Release(const platform::Place& place) { return allocation::AllocatorFacade::Instance().Release(place); } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +std::shared_ptr AllocShared(const platform::CUDAPlace& place, + const cudaStream_t& stream, + size_t size) { + return allocation::AllocatorFacade::Instance().AllocShared(place, stream, + size); +} + +AllocationPtr Alloc(const platform::CUDAPlace& place, + const cudaStream_t& stream, size_t size) { + return allocation::AllocatorFacade::Instance().Alloc(place, stream, size); +} + +uint64_t Release(const platform::CUDAPlace& place, const cudaStream_t& stream) { + return allocation::AllocatorFacade::Instance().Release(place, stream); +} + +void RecordStream(Allocation* allocation, const cudaStream_t& stream) { + return allocation::AllocatorFacade::Instance().RecordStream(allocation, + stream); +} +#endif } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 3b8d07548ee0c0..0b49e1cbee1047 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -40,5 +40,18 @@ extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size); extern uint64_t Release(const platform::Place& place); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +extern std::shared_ptr AllocShared(const platform::CUDAPlace& place, + const cudaStream_t& stream, + size_t size); + +extern AllocationPtr Alloc(const platform::CUDAPlace& place, + const cudaStream_t& stream, size_t size); + +extern uint64_t Release(const platform::CUDAPlace& place, + const cudaStream_t& stream); + +void RecordStream(Allocation* allocation, const cudaStream_t& stream); +#endif } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index d015ed7ce693fb..e8d30e6fe98533 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -27,6 +27,11 @@ #include "gtest/gtest.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gpu_info.h" + +#if defined(PADDLE_WITH_CUDA) +DECLARE_int64(gpu_allocator_retry_time); +#endif namespace paddle { namespace memory { @@ -169,5 +174,144 @@ TEST(Malloc, AllocZero) { AllocationPtr allocation_ptr = Alloc(place, 0); EXPECT_GE(allocation_ptr->size(), 0); } + +TEST(Malloc, CUDAAllocRetry) { + platform::Place place = platform::CUDAPlace(); + size_t available_size = platform::GpuAvailableMemToAlloc(); + // alloc_size < available_size < 2 * alloc_size + size_t alloc_size = available_size / 4 * 3; + + auto alloc_fun = [&place, alloc_size]() { + return AllocShared(place, alloc_size); + }; + std::shared_ptr allocation = alloc_fun(); + auto start_time = std::chrono::steady_clock::now(); + std::thread th(alloc_fun); + std::this_thread::sleep_for(std::chrono::seconds(1)); + allocation.reset(); + th.join(); + auto end_time = std::chrono::steady_clock::now(); + + std::chrono::duration time = 
end_time - start_time;
+  VLOG(10) << "time cost = " << time.count() << " s";
+  EXPECT_LE(time.count() * 1000, FLAGS_gpu_allocator_retry_time);
+}
+
+__global__ void add_kernel(int *x, int n) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
+    atomicAdd(x + i, tid);
+  }
+}
+
+class StreamSafeCUDAAllocTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    place_ = platform::CUDAPlace();
+    stream_num_ = 64;
+    grid_num_ = 1;
+    block_num_ = 64;
+    data_num_ = 64;
+    default_stream = nullptr;
+
+    streams_.reserve(stream_num_);
+    streams_.emplace_back(default_stream);
+    for (size_t i = 1; i < stream_num_; ++i) {
+      cudaStream_t stream;
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
+      streams_.emplace_back(stream);
+    }
+
+    for (size_t i = 0; i < stream_num_; ++i) {
+      size_t allocation_size = data_num_ * sizeof(int);
+      std::shared_ptr allocation =
+          AllocShared(place_, streams_[i], allocation_size);
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaMemset(allocation->ptr(), 0, allocation->size()));
+      allocations_.emplace_back(allocation);
+    }
+  }
+
+  void SingleStreamRun(size_t idx) {
+    for (size_t i = 0; i < stream_num_; ++i) {
+      int *x = reinterpret_cast(allocations_[i]->ptr());
+      add_kernel<<>>(x, data_num_);
+      if (i != idx) {
+        RecordStream(allocations_[i].get(), streams_[idx]);
+      }
+    }
+  }
+
+  void MultiStreamRun() {
+    for (int i = 0; i < stream_num_; ++i) {
+      SingleStreamRun(i);
+    }
+    allocations_.clear();  // fast_gc
+  }
+
+  void MultiThreadMultiStreamRun() {
+    std::vector threads;
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads.push_back(
+          std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i));
+    }
+    for (size_t i = 0; i < stream_num_; ++i) {
+      threads[i].join();
+    }
+    allocations_.clear();  // fast_gc
+  }
+
+  void CheckResult() {
+    auto host_x = std::unique_ptr(new int[data_num_]);
+    size_t thread_num = grid_num_ * block_num_;
+    for (int i = 0; i < stream_num_; ++i) {
+      // tricky code, the allocations are still accessible even though
+      // allocations_.clear() has been called
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaMemcpy(host_x.get(), allocations_[i]->ptr(),
+                     data_num_ * sizeof(int), cudaMemcpyDeviceToHost));
+      for (int j = 0; j < data_num_; ++j) {
+        EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_);
+      }
+    }
+  }
+
+  void TearDown() override {
+    cudaDeviceSynchronize();
+    for (cudaStream_t stream : streams_) {
+      Release(place_, stream);
+    }
+
+    for (size_t i = 1; i < stream_num_; ++i) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i]));
+    }
+
+    uint64_t cuda_malloc_size =
+        platform::RecordedCudaMallocSize(place_.GetDeviceId());
+    ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size
+                                   << " bytes memory that not released yet, "
+                                      "there may be a memory leak problem.";
+  }
+
+  size_t stream_num_;
+  size_t grid_num_;
+  size_t block_num_;
+  size_t data_num_;
+  platform::CUDAPlace place_;
+  cudaStream_t default_stream;
+  std::vector streams_;
+  std::vector> allocations_;
+};
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMultiStream) {
+  MultiStreamRun();
+  CheckResult();
+}
+
+TEST_F(StreamSafeCUDAAllocTest, CUDAMultiThreadMultiStream) {
+  MultiThreadMultiStreamRun();
+  CheckResult();
+}
+
 }  // namespace memory
 }  // namespace paddle

From ee154eef809d8764d7792748c368218c8d7123a4 Mon Sep 17 00:00:00 2001
From: chenruibiao
Date: Thu, 18 Nov 2021 18:00:26 +0800
Subject: [PATCH 02/13] Do not notify retrying threads on other streams when
 freeing a CUDA allocation

---
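In outline, the allocation path after this commit becomes (an illustrative
sketch, not the literal code; GetCUDAAllocator, cuda_allocators_, and the
RetryAllocator wrapper all appear in the diff below):

    AllocationPtr CUDAAllocSketch(const platform::CUDAPlace& place,
                                  const cudaStream_t& stream, size_t size) {
      // Each per-stream allocator is now wrapped in a RetryAllocator, so a
      // first failure already waits up to FLAGS_gpu_allocator_retry_time ms.
      auto allocator = GetCUDAAllocator(place, stream);
      try {
        return allocator->Allocate(size);
      } catch (BadAlloc&) {
        // Last resort: release idle chunks cached by every stream's allocator
        // on this device, then retry once; a second BadAlloc propagates.
        for (auto& pair : cuda_allocators_[place]) {
          pair.second->Release(place);
        }
        return allocator->Allocate(size);
      }
    }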
paddle/fluid/memory/allocation/allocator.h | 9 -- .../memory/allocation/allocator_facade.cc | 83 +++++-------------- .../memory/allocation/allocator_facade.h | 2 - .../memory/allocation/retry_allocator.cc | 19 +++++ .../fluid/memory/allocation/retry_allocator.h | 19 ----- .../allocation/stream_safe_cuda_allocator.cc | 7 +- .../allocation/stream_safe_cuda_allocator.h | 1 - paddle/fluid/memory/malloc_test.cu | 42 ++++++---- 8 files changed, 67 insertions(+), 115 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index de108434154395..b11c657b96b74c 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -39,7 +39,6 @@ struct BadAlloc : public std::exception { }; class Allocator; -extern void NotifyGPURetryThreads(); // Allocation is the object holding the actually pointer. Use // `Allocation::ptr()` will returns the pointer that allocated. @@ -156,16 +155,8 @@ class Allocator { class AllocationDeleter { public: inline void operator()(Allocation* allocation) const { - VLOG(10) << "Run AllocationDeleter"; Allocator* allocator = allocation->TopDecoratedAllocator(); allocator->Free(allocation); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(Ruibiao): Dirty code, consider a better design to notify CUDA - // alloc retry - if (platform::is_gpu_place(allocation->place())) { - NotifyGPURetryThreads(); - } -#endif } }; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index d5bab43d60c442..636798d09d4c42 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -170,7 +170,7 @@ class AllocatorFacadePrivate { << " " << place << " " << size; if (platform::is_gpu_place(place) && size > 0) { - return GetCUDAAllocator(boost::get(place), + return GetCUDAAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), default_cuda_stream_); } @@ -206,8 +206,6 @@ class AllocatorFacadePrivate { cudaStream_t GetDefaultCudaStream() { return default_cuda_stream_; } - void NotifyGPURetryThreads() { cuda_retry_cv_.notify_all(); } - void RecordStream(Allocation* allocation, const cudaStream_t& stream) { PADDLE_ENFORCE_EQ( platform::is_gpu_place(allocation->place()), true, @@ -229,9 +227,9 @@ class AllocatorFacadePrivate { try { cuda_allocator = GetCUDAAllocator(place, stream); } catch (platform::EnforceNotMet& err) { - VLOG(9) << "No allocator found for stream " << stream << "in place " + VLOG(9) << "No allocator found for stream " << stream << " in place " << place << " , build a new one"; - std::unique_lock lock(cuda_retry_mutex_); + std::unique_lock lock(cuda_init_mutex_); try { cuda_allocator = GetCUDAAllocator(place, stream); } catch (platform::EnforceNotMet& err) { @@ -244,61 +242,19 @@ class AllocatorFacadePrivate { throw; } - if (FLAGS_gpu_allocator_retry_time <= 0) { - return cuda_allocator->Allocate(size); - } - - // In fact, we can unify the code of allocation success and failure - // But it would add lock even when allocation success at the first time try { return cuda_allocator->Allocate(size); } catch (BadAlloc&) { VLOG(9) << "Allocation failed when allocating " << size << " bytes for stream " << stream; for (auto pair : cuda_allocators_[place]) { - std::shared_ptr cuda_allocator = pair.second; - std::dynamic_pointer_cast(cuda_allocator) - ->ProcessEventsAndFree(); + pair.second->Release(place); } try { return cuda_allocator->Allocate(size); - } 
catch (BadAlloc&) { - { - WaitedAllocateSizeGuard guard(&cuda_waited_allocate_size_, size); - VLOG(10) - << "Still allocation failed after calling ProcessEventAndFree, " - << " cuda_waited_allocate_size_ = " << cuda_waited_allocate_size_; - // We can just write allocation retry inside the predicate function of - // wait_until. But it needs to acquire the lock when executing - // predicate - // function. For better performance, we use loop here - auto end_time = - std::chrono::high_resolution_clock::now() + - std::chrono::milliseconds(FLAGS_gpu_allocator_retry_time); - auto wait_until = [&end_time, this] { - std::unique_lock lock(cuda_retry_mutex_); - return cuda_retry_cv_.wait_until(lock, end_time); - }; - - size_t retry_times = 0; - while (wait_until() != std::cv_status::timeout) { - try { - return cuda_allocator->Allocate(size); - } catch (BadAlloc&) { - ++retry_times; - VLOG(10) << "Allocation failed when retrying " << retry_times - << " times when allocating " << size - << " bytes. Wait still."; - } catch (...) { - throw; - } - } - } - VLOG(10) << "Allocation failed because of timeout when allocating " - << size << " bytes."; - return cuda_allocator->Allocate( - size); // If timeout, try last allocation request } catch (...) { + VLOG(9) << "Still allocation failed " + << "after release memory from all streams"; throw; } } catch (...) { @@ -414,6 +370,7 @@ class AllocatorFacadePrivate { } } WrapStreamSafeCUDAAllocator(p, stream); + WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time); } void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p, @@ -501,6 +458,16 @@ class AllocatorFacadePrivate { cuda_allocators_[p][stream] = std::make_shared(p); } + void WrapCUDARetryAllocator(platform::CUDAPlace p, cudaStream_t stream, + size_t retry_time) { + PADDLE_ENFORCE_GT( + retry_time, 0, + platform::errors::InvalidArgument( + "Retry time should be larger than 0, but got %d", retry_time)); + std::shared_ptr& allocator = cuda_allocators_[p][stream]; + allocator = std::make_shared(allocator, retry_time); + } + void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { const std::shared_ptr& underlying_allocator = GetCUDAAllocator(p, stream); @@ -607,10 +574,7 @@ class AllocatorFacadePrivate { // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; cudaStream_t default_cuda_stream_; - static std::condition_variable cuda_retry_cv_; - std::mutex cuda_retry_mutex_; std::mutex cuda_init_mutex_; - std::atomic cuda_waited_allocate_size_{0}; #ifdef PADDLE_WITH_CUDA std::unordered_map> cuda_graph_allocator_map_; @@ -625,9 +589,6 @@ class AllocatorFacadePrivate { AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::zero_size_allocators_; AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -std::condition_variable AllocatorFacadePrivate::cuda_retry_cv_; -#endif // Pimpl. Make interface clean. 
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} @@ -649,7 +610,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { - return Alloc(boost::get(place), + return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultCudaStream(), size); } #endif @@ -659,7 +620,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { - return Release(boost::get(place), + return Release(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultCudaStream()); } #endif @@ -692,8 +653,6 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, return m_->GetCUDAAllocator(place, stream)->Release(place); } -void AllocatorFacade::NotifyGPURetryThreads() { m_->NotifyGPURetryThreads(); } - void AllocatorFacade::RecordStream(Allocation* allocation, const cudaStream_t& stream) { m_->RecordStream(allocation, stream); @@ -708,10 +667,6 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { return m_->RemoveMemoryPoolOfCUDAGraph(id); } #endif - -void NotifyGPURetryThreads() { - allocation::AllocatorFacade::Instance().NotifyGPURetryThreads(); -} #endif } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 507b36abd5404c..b61e1616fee45e 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -65,8 +65,6 @@ class AllocatorFacade { const cudaStream_t& stream, size_t size); uint64_t Release(const platform::CUDAPlace& place, const cudaStream_t& stream); - - void NotifyGPURetryThreads(); void RecordStream(Allocation* allocation, const cudaStream_t& stream); #ifdef PADDLE_WITH_CUDA void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 86c7ad3dc5be16..1607af3808b434 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -20,6 +20,25 @@ namespace paddle { namespace memory { namespace allocation { +class WaitedAllocateSizeGuard { + public: + WaitedAllocateSizeGuard(std::atomic* waited_size, + size_t requested_size) + : waited_size_(waited_size), requested_size_(requested_size) { + waited_size_->fetch_add(requested_size_, + std::memory_order::memory_order_relaxed); + } + + ~WaitedAllocateSizeGuard() { + waited_size_->fetch_sub(requested_size_, + std::memory_order::memory_order_relaxed); + } + + private: + std::atomic* waited_size_; + size_t requested_size_; +}; + void RetryAllocator::FreeImpl(Allocation* allocation) { // Delete underlying allocation first. 
size_t size = allocation->size(); diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 36ff2e027c1a37..031a5e2b97f178 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -28,25 +28,6 @@ namespace paddle { namespace memory { namespace allocation { -class WaitedAllocateSizeGuard { - public: - WaitedAllocateSizeGuard(std::atomic* waited_size, - size_t requested_size) - : waited_size_(waited_size), requested_size_(requested_size) { - waited_size_->fetch_add(requested_size_, - std::memory_order::memory_order_relaxed); - } - - ~WaitedAllocateSizeGuard() { - waited_size_->fetch_sub(requested_size_, - std::memory_order::memory_order_relaxed); - } - - private: - std::atomic* waited_size_; - size_t requested_size_; -}; - class RetryAllocator : public Allocator { public: RetryAllocator(std::shared_ptr allocator, size_t retry_ms) diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 5558ddcbbb5361..dcc203ae462ad9 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -73,8 +73,8 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { if (deque_it == outstanding_events.end()) { outstanding_events.clear(); Allocation* allocation = map_it->first; - auto next_it = ++map_it; // "map_it" may be invalid after calling - // FreeStreamSafeCUDAAllocation + // "map_it" may be invalid after calling FreeStreamSafeCUDAAllocation + auto next_it = ++map_it; FreeStreamSafeCUDAAllocation(allocation); map_it = next_it; } else { @@ -151,7 +151,8 @@ StreamSafeCUDAAllocator::GetAllocationInfo(Allocation* allocation) { auto it = allocation_info_map_.find(allocation); PADDLE_ENFORCE_NE( it, allocation_info_map_.end(), - "The recorded allocation is not malloced by this allocator."); + platform::errors::NotFound( + "The recorded allocation is not malloced by this allocator")); return it->second; } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 1746f537f374cb..33a542ddc97733 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -14,7 +14,6 @@ #pragma once -#include #include #include #include diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index e8d30e6fe98533..0bf9435ae700c2 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -175,26 +175,30 @@ TEST(Malloc, AllocZero) { EXPECT_GE(allocation_ptr->size(), 0); } -TEST(Malloc, CUDAAllocRetry) { - platform::Place place = platform::CUDAPlace(); +TEST(Malloc, StreamSafeCUDAAllocRetry) { + platform::CUDAPlace place = platform::CUDAPlace(); + cudaStream_t stream1, stream2; + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); + size_t available_size = platform::GpuAvailableMemToAlloc(); // alloc_size < available_size < 2 * alloc_size size_t alloc_size = available_size / 4 * 3; - auto alloc_fun = [&place, alloc_size]() { - return AllocShared(place, alloc_size); - }; - std::shared_ptr allocation = alloc_fun(); - auto start_time = std::chrono::steady_clock::now(); - std::thread th(alloc_fun); - std::this_thread::sleep_for(std::chrono::seconds(1)); - 
allocation.reset(); + std::shared_ptr allocation1 = + AllocShared(place, stream1, alloc_size); + std::shared_ptr allocation2; + + std::thread th([&allocation2, &place, &stream2, alloc_size]() { + std::this_thread::sleep_for(std::chrono::seconds(1)); + allocation2 = AllocShared(place, stream2, alloc_size); + }); + allocation1.reset(); // free but not release th.join(); - auto end_time = std::chrono::steady_clock::now(); + EXPECT_GE(allocation2->size(), alloc_size); - std::chrono::duration time = end_time - start_time; - VLOG(10) << "time cost = " << time.count() << " s"; - EXPECT_LE(time.count() * 1000, FLAGS_gpu_allocator_retry_time); + Release(place, stream1); + Release(place, stream2); } __global__ void add_kernel(int *x, int n) { @@ -213,6 +217,8 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { block_num_ = 64; data_num_ = 64; default_stream = nullptr; + cuda_malloc_size_before_test_ = + platform::RecordedCudaMallocSize(place_.GetDeviceId()); streams_.reserve(stream_num_); streams_.emplace_back(default_stream); @@ -288,15 +294,17 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { uint64_t cuda_malloc_size = platform::RecordedCudaMallocSize(place_.GetDeviceId()); - ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size - << " bytes memory that not released yet, " - "there may be a memory leak problem."; + ASSERT_EQ(cuda_malloc_size, cuda_malloc_size_before_test_) + << "Found " << cuda_malloc_size + << " bytes memory that not released yet, there may be a memory leak " + "problem."; } size_t stream_num_; size_t grid_num_; size_t block_num_; size_t data_num_; + uint64_t cuda_malloc_size_before_test_; platform::CUDAPlace place_; cudaStream_t default_stream; std::vector streams_; From c9e5291cfef8010b02bfb0fba6228033fc4e9d0e Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Thu, 18 Nov 2021 20:20:20 +0800 Subject: [PATCH 03/13] Fix compile error for CPU --- paddle/fluid/memory/allocation/CMakeLists.txt | 13 ++++++++----- paddle/fluid/memory/allocation/allocator_facade.cc | 13 ++++--------- .../memory/allocation/stream_safe_cuda_allocator.cc | 1 - .../memory/allocation/stream_safe_cuda_allocator.h | 2 ++ paddle/fluid/memory/malloc_test.cu | 6 ++---- 5 files changed, 16 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 4b4717cead36e5..4d44c533b7456f 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -15,8 +15,10 @@ endif() if (WITH_GPU) nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) - nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) + nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) + nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) + cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) if(CUDA_VERSION GREATER_EQUAL 10.2) nv_library(cuda_virtual_mem_allocator SRCS cuda_virtual_mem_allocator.cc DEPS dynload_cuda) @@ -25,8 +27,10 @@ endif() if (WITH_ROCM) hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) - hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) + hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc 
DEPS allocator) + hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) + cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) endif() @@ -38,7 +42,7 @@ endif() cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) if (WITH_GPU OR WITH_ROCM) - set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) + set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator) if(CUDA_VERSION GREATER_EQUAL 10.2) list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator) endif() @@ -88,7 +92,7 @@ endif() cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stream_safe_cuda_allocator) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) if (WITH_GPU) target_link_libraries(allocator_facade cuda_graph) @@ -113,7 +117,6 @@ cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator) cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator) -cc_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator) if(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 636798d09d4c42..902eba1adfc3d8 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -14,15 +14,7 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include -#include -#include - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#endif #include "gflags/gflags.h" - #include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" @@ -30,7 +22,6 @@ #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" -#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/place.h" @@ -38,8 +29,10 @@ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h" +#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/platform/gpu_info.h" #endif @@ -169,10 +162,12 @@ class AllocatorFacadePrivate { VLOG(6) << "GetAllocator" << " " << place << " " << size; +#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place) && 
size > 0) { return GetCUDAAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), default_cuda_stream_); } +#endif const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_ diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index dcc203ae462ad9..0993921ebc0ddc 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" -#include "paddle/fluid/memory/allocation/allocator.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 33a542ddc97733..8c5f32665b8770 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -14,9 +14,11 @@ #pragma once +#include #include #include #include +#include #include #include "paddle/fluid/memory/allocation/allocator.h" diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index 0bf9435ae700c2..b243acdeabc2c6 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -29,10 +29,6 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/gpu_info.h" -#if defined(PADDLE_WITH_CUDA) -DECLARE_int64(gpu_allocator_retry_time); -#endif - namespace paddle { namespace memory { @@ -175,6 +171,7 @@ TEST(Malloc, AllocZero) { EXPECT_GE(allocation_ptr->size(), 0); } +#ifdef PADDLE_WITH_CUDA TEST(Malloc, StreamSafeCUDAAllocRetry) { platform::CUDAPlace place = platform::CUDAPlace(); cudaStream_t stream1, stream2; @@ -320,6 +317,7 @@ TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStream) { MultiThreadMUltiStreamRun(); CheckResult(); } +#endif } // namespace memory } // namespace paddle From e1e8012380de0814853eed62aebbf5959ee8c650 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Thu, 18 Nov 2021 21:38:57 +0800 Subject: [PATCH 04/13] Fix compile error for HIP --- .../memory/allocation/allocator_facade.cc | 64 ++++++++++--------- .../memory/allocation/allocator_facade.h | 9 ++- .../allocation/stream_safe_cuda_allocator.cc | 43 +++++++++---- .../allocation/stream_safe_cuda_allocator.h | 26 ++++---- paddle/fluid/memory/malloc.cc | 10 +-- paddle/fluid/memory/malloc.h | 8 +-- paddle/fluid/memory/malloc_test.cu | 43 ++++++++++--- 7 files changed, 129 insertions(+), 74 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 902eba1adfc3d8..a47dba54932646 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -25,29 +25,36 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/platform/gpu_info.h" + 
+#ifdef PADDLE_WITH_CUDA +#include +#include "paddle/fluid/platform/cuda_graph.h" +#else +#include #endif + #if CUDA_VERSION >= 10020 #include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h" #include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" #include "paddle/fluid/platform/dynload/cuda_driver.h" #endif -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_graph.h" #endif + #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu/xpu_info.h" #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif + PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, "The retry time (milliseconds) when allocator fails " @@ -119,7 +126,7 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using CUDAAllocatorMap = std::map>>; + std::map>>; #endif explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { @@ -130,9 +137,9 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allow_free_idle_chunk_ = allow_free_idle_chunk; - default_cuda_stream_ = nullptr; + default_stream_ = nullptr; for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { - InitCUDAAllocator(platform::CUDAPlace(dev_id), default_cuda_stream_); + InitCUDAAllocator(platform::CUDAPlace(dev_id), default_stream_); } InitNaiveBestFitCUDAPinnedAllocator(); #endif @@ -165,7 +172,7 @@ class AllocatorFacadePrivate { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place) && size > 0) { return GetCUDAAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), - default_cuda_stream_); + default_stream_); } #endif @@ -182,13 +189,13 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const std::shared_ptr& GetCUDAAllocator( - const platform::CUDAPlace& place, const cudaStream_t& stream) { + const platform::CUDAPlace& place, const gpuStream_t& stream) { auto place_it = cuda_allocators_.find(place); PADDLE_ENFORCE_NE(place_it, cuda_allocators_.end(), platform::errors::NotFound( "No allocator found for the place %s", place)); - const std::map>& allocator_map = + const std::map>& allocator_map = place_it->second; auto stream_it = allocator_map.find(stream); PADDLE_ENFORCE_NE( @@ -199,9 +206,9 @@ class AllocatorFacadePrivate { return stream_it->second; } - cudaStream_t GetDefaultCudaStream() { return default_cuda_stream_; } + gpuStream_t GetDefaultStream() { return default_stream_; } - void RecordStream(Allocation* allocation, const cudaStream_t& stream) { + void RecordStream(Allocation* allocation, const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( platform::is_gpu_place(allocation->place()), true, platform::errors::InvalidArgument( @@ -211,7 +218,7 @@ class AllocatorFacadePrivate { } AllocationPtr CUDAAlloc(const platform::CUDAPlace& place, - const cudaStream_t& stream, size_t size) { + const gpuStream_t& stream, size_t size) { std::shared_ptr cuda_allocator; /* NOTE(Ruibiao): This code does not lead to lock competition * for seraching initialized CUDA allocator in multithreaded scenario. 
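The lookup scheme the NOTE above refers to is a double-checked pattern over
cuda_allocators_: the common case probes the map with no lock held, and
cuda_init_mutex_ is taken only the first time a (place, stream) pair is
requested. A minimal sketch of the idea with explicit find() calls (names
follow the patch; this is an illustration, not the verbatim implementation):

    std::shared_ptr<Allocator> GetOrCreateCUDAAllocator(
        const platform::CUDAPlace& place, const gpuStream_t& stream) {
      // Fast path: no lock when the (place, stream) allocator already exists.
      auto place_it = cuda_allocators_.find(place);
      if (place_it != cuda_allocators_.end()) {
        auto stream_it = place_it->second.find(stream);
        if (stream_it != place_it->second.end()) {
          return stream_it->second;
        }
      }
      // Slow path: serialize creation, then probe again in case another
      // thread built the allocator while this one waited on the mutex.
      std::unique_lock<std::mutex> lock(cuda_init_mutex_);
      place_it = cuda_allocators_.find(place);
      if (place_it == cuda_allocators_.end() ||
          place_it->second.find(stream) == place_it->second.end()) {
        InitCUDAAllocator(place, stream);  // create, wrap retry/stream-safe
      }
      return cuda_allocators_[place][stream];
    }

The re-check under the lock is what makes the worst case cost three lookups
while keeping the hot path lock-free.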
@@ -345,7 +352,7 @@ class AllocatorFacadePrivate { std::make_shared(platform::CUDAPinnedPlace()); } - void InitCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { + void InitCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCUDAAllocator(p, stream); @@ -369,11 +376,11 @@ class AllocatorFacadePrivate { } void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p, - cudaStream_t stream) { + gpuStream_t stream) { cuda_allocators_[p][stream] = std::make_shared(p); } - void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { #if defined(PADDLE_WITH_HIP) auto cuda_allocator = std::make_shared(p); cuda_allocators_[p][stream] = std::make_shared( @@ -448,12 +455,11 @@ class AllocatorFacadePrivate { #endif } - void InitThreadLocalCUDAAllocator(platform::CUDAPlace p, - cudaStream_t stream) { + void InitThreadLocalCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { cuda_allocators_[p][stream] = std::make_shared(p); } - void WrapCUDARetryAllocator(platform::CUDAPlace p, cudaStream_t stream, + void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, size_t retry_time) { PADDLE_ENFORCE_GT( retry_time, 0, @@ -463,7 +469,7 @@ class AllocatorFacadePrivate { allocator = std::make_shared(allocator, retry_time); } - void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, cudaStream_t stream) { + void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { const std::shared_ptr& underlying_allocator = GetCUDAAllocator(p, stream); cuda_allocators_[p][stream] = @@ -568,7 +574,7 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; - cudaStream_t default_cuda_stream_; + gpuStream_t default_stream_; std::mutex cuda_init_mutex_; #ifdef PADDLE_WITH_CUDA std::unordered_map> @@ -606,7 +612,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), - m_->GetDefaultCudaStream(), size); + m_->GetDefaultStream(), size); } #endif return m_->GetAllocator(place, size)->Allocate(size); @@ -616,7 +622,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { return Release(BOOST_GET_CONST(platform::CUDAPlace, place), - m_->GetDefaultCudaStream()); + m_->GetDefaultStream()); } #endif return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) @@ -630,12 +636,12 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::shared_ptr AllocatorFacade::AllocShared( - const platform::CUDAPlace& place, const cudaStream_t& stream, size_t size) { + const platform::CUDAPlace& place, const gpuStream_t& stream, size_t size) { return std::shared_ptr(Alloc(place, stream, size)); } AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, - const cudaStream_t& stream, size_t size) { + const gpuStream_t& stream, size_t size) { if (size > 0) { return m_->CUDAAlloc(place, stream, size); } else { @@ -644,12 +650,12 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, } uint64_t 
AllocatorFacade::Release(const platform::CUDAPlace& place, - const cudaStream_t& stream) { + const gpuStream_t& stream) { return m_->GetCUDAAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(Allocation* allocation, - const cudaStream_t& stream) { + const gpuStream_t& stream) { m_->RecordStream(allocation, stream); } diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index b61e1616fee45e..813cf62c1366c9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -59,13 +59,12 @@ class AllocatorFacade { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::shared_ptr AllocShared(const platform::CUDAPlace& place, - const cudaStream_t& stream, + const gpuStream_t& stream, size_t size); AllocationPtr Alloc(const platform::CUDAPlace& place, - const cudaStream_t& stream, size_t size); - uint64_t Release(const platform::CUDAPlace& place, - const cudaStream_t& stream); - void RecordStream(Allocation* allocation, const cudaStream_t& stream); + const gpuStream_t& stream, size_t size); + uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); + void RecordStream(Allocation* allocation, const gpuStream_t& stream); #ifdef PADDLE_WITH_CUDA void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id); diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 0993921ebc0ddc..d7515baa2787b2 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -19,14 +19,14 @@ namespace memory { namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( - AllocationPtr underlying_allocation, cudaStream_t owning_stream) + AllocationPtr underlying_allocation, gpuStream_t owning_stream) : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)), owning_stream_(owning_stream), - recorded_streams_(std::make_shared>()) {} + recorded_streams_(std::make_shared>()) {} -void StreamSafeCUDAAllocation::RecordStream(cudaStream_t stream) { +void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) { VLOG(8) << "Record stream " << stream << " to " << ptr(); if (stream == owning_stream_) { return; @@ -35,14 +35,14 @@ void StreamSafeCUDAAllocation::RecordStream(cudaStream_t stream) { recorded_streams_->insert(stream); } -std::shared_ptr> +std::shared_ptr> StreamSafeCUDAAllocation::GetRecordedStreams() { return recorded_streams_; } StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( const std::shared_ptr& underlying_allocator, - const cudaStream_t default_stream) + const gpuStream_t default_stream) : underlying_allocator_(underlying_allocator), default_stream_(default_stream) {} @@ -51,13 +51,14 @@ bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } void StreamSafeCUDAAllocator::ProcessEventsAndFree() { for (auto map_it = allocation_info_map_.begin(); map_it != allocation_info_map_.end();) { - std::deque& outstanding_events = + std::deque& outstanding_events = map_it->second->outstanding_events; VLOG(10) << "Check " << outstanding_events.size() << " outstanding events for " << map_it->first->ptr(); auto deque_it = outstanding_events.begin(); while (deque_it != outstanding_events.end()) { - cudaError_t err 
= cudaEventQuery(*deque_it);
+#ifdef PADDLE_WITH_CUDA
+      gpuError_t err = cudaEventQuery(*deque_it);
       if (err == cudaErrorNotReady) {
         VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr()
                  << " is not complete";
@@ -67,6 +68,18 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() {
       PADDLE_ENFORCE_CUDA_SUCCESS(err);
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it));
       ++deque_it;
+#else
+      gpuError_t err = hipEventQuery(*deque_it);
+      if (err == hipErrorNotReady) {
+        VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr()
+                 << " is not complete";
+        outstanding_events.erase(outstanding_events.begin(), deque_it);
+        break;
+      }
+      PADDLE_ENFORCE_CUDA_SUCCESS(err);
+      PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it));
+      ++deque_it;
+#endif
     }
 
     if (deque_it == outstanding_events.end()) {
@@ -107,13 +120,19 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
 }
 
 void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream(
-    std::set<cudaStream_t>* recorded_streams,
-    std::deque<cudaEvent_t>* outstanding_events) {
-  for (cudaStream_t stream : *recorded_streams) {
-    cudaEvent_t event;
+    std::set<gpuStream_t>* recorded_streams,
+    std::deque<gpuEvent_t>* outstanding_events) {
+  for (gpuStream_t stream : *recorded_streams) {
+    gpuEvent_t event;
+#ifdef PADDLE_WITH_CUDA
     PADDLE_ENFORCE_CUDA_SUCCESS(
         cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
     PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
+#else
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        hipEventCreateWithFlags(&event, hipEventDisableTiming));
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream));
+#endif
     outstanding_events->emplace_back(event);
     VLOG(9) << "Record event " << event << " in stream " << stream;
   }
@@ -128,7 +147,7 @@ void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation(
     return;
   }
 
-  std::deque<cudaEvent_t>& outstanding_events =
+  std::deque<gpuEvent_t>& outstanding_events =
       allocation_info->outstanding_events;
   CreateEventForAllRecordedStream(
       dynamic_cast<StreamSafeCUDAAllocation*>(allocation)
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
index 8c5f32665b8770..f74c01e970c109 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
@@ -13,8 +13,12 @@
 // limitations under the License.
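Both branches of the header below compile because Paddle aliases the stream,
event and error handles to toolchain-specific types. Roughly, and as an
illustration only (the exact platform header that defines these aliases is an
assumption, it is not part of this patch):

    #ifdef PADDLE_WITH_HIP
    using gpuStream_t = hipStream_t;
    using gpuEvent_t = hipEvent_t;
    using gpuError_t = hipError_t;
    #else  // PADDLE_WITH_CUDA
    using gpuStream_t = cudaStream_t;
    using gpuEvent_t = cudaEvent_t;
    using gpuError_t = cudaError_t;
    #endif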
#pragma once - +#ifdef PADDLE_WITH_CUDA #include +#else +#include +#endif + #include #include #include @@ -29,14 +33,14 @@ namespace allocation { class StreamSafeCUDAAllocation : public Allocation { public: StreamSafeCUDAAllocation(AllocationPtr underlying_allocation, - cudaStream_t owning_stream); - void RecordStream(cudaStream_t stream); - std::shared_ptr> GetRecordedStreams(); + gpuStream_t owning_stream); + void RecordStream(gpuStream_t stream); + std::shared_ptr> GetRecordedStreams(); private: AllocationPtr underlying_allocation_; - cudaStream_t owning_stream_; - std::shared_ptr> recorded_streams_; + gpuStream_t owning_stream_; + std::shared_ptr> recorded_streams_; std::mutex mutex_; }; @@ -44,7 +48,7 @@ class StreamSafeCUDAAllocator : public Allocator { public: StreamSafeCUDAAllocator( const std::shared_ptr &underlying_allocator, - const cudaStream_t default_stream); + const gpuStream_t default_stream); bool IsAllocThreadSafe() const override; void ProcessEventsAndFree(); @@ -55,18 +59,18 @@ class StreamSafeCUDAAllocator : public Allocator { private: struct AllocationInfo { - std::deque outstanding_events; + std::deque outstanding_events; bool can_be_freed{false}; }; void CreateEventForAllRecordedStream( - std::set *recorded_streams, - std::deque *outstanding_events); + std::set *recorded_streams, + std::deque *outstanding_events); void FreeStreamSafeCUDAAllocation(Allocation *allocation); std::shared_ptr GetAllocationInfo(Allocation *); std::shared_ptr underlying_allocator_; - cudaStream_t default_stream_; + gpuStream_t default_stream_; std::unordered_map> allocation_info_map_; mutable std::recursive_mutex mutex_; diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 55b4657df3ac62..9cd391acca6a45 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -35,22 +35,22 @@ uint64_t Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::shared_ptr AllocShared(const platform::CUDAPlace& place, - const cudaStream_t& stream, + const gpuStream_t& stream, size_t size) { return allocation::AllocatorFacade::Instance().AllocShared(place, stream, size); } -AllocationPtr Alloc(const platform::CUDAPlace& place, - const cudaStream_t& stream, size_t size) { +AllocationPtr Alloc(const platform::CUDAPlace& place, const gpuStream_t& stream, + size_t size) { return allocation::AllocatorFacade::Instance().Alloc(place, stream, size); } -uint64_t Release(const platform::CUDAPlace& place, const cudaStream_t& stream) { +uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } -void RecordStream(Allocation* allocation, const cudaStream_t& stream) { +void RecordStream(Allocation* allocation, const gpuStream_t& stream) { return allocation::AllocatorFacade::Instance().RecordStream(allocation, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 0b49e1cbee1047..4926dc572cad4c 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -42,16 +42,16 @@ extern uint64_t Release(const platform::Place& place); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) extern std::shared_ptr AllocShared(const platform::CUDAPlace& place, - const cudaStream_t& stream, + const gpuStream_t& stream, size_t size); extern AllocationPtr Alloc(const platform::CUDAPlace& place, - const cudaStream_t& stream, size_t size); + const gpuStream_t& stream, size_t size); extern uint64_t 
Release(const platform::CUDAPlace& place, - const cudaStream_t& stream); + const gpuStream_t& stream); -void RecordStream(Allocation* allocation, const cudaStream_t& stream); +void RecordStream(Allocation* allocation, const gpuStream_t& stream); #endif } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index b243acdeabc2c6..3d0c1896bfaafa 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -174,10 +174,14 @@ TEST(Malloc, AllocZero) { #ifdef PADDLE_WITH_CUDA TEST(Malloc, StreamSafeCUDAAllocRetry) { platform::CUDAPlace place = platform::CUDAPlace(); - cudaStream_t stream1, stream2; + gpuStream_t stream1, stream2; +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); - +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream1)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream2)); +#endif size_t available_size = platform::GpuAvailableMemToAlloc(); // alloc_size < available_size < 2 * alloc_size size_t alloc_size = available_size / 4 * 3; @@ -220,8 +224,12 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { streams_.reserve(stream_num_); streams_.emplace_back(default_stream); for (size_t i = 1; i < stream_num_; ++i) { - cudaStream_t stream; + gpuStream_t stream; +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); +#endif streams_.emplace_back(stream); } @@ -229,8 +237,13 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { size_t allocation_size = data_num_ * sizeof(int); std::shared_ptr allocation = AllocShared(place_, streams_[i], allocation_size); +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemset(allocation->ptr(), 0, allocation->size())); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + hipMemset(allocation->ptr(), 0, allocation->size())); +#endif allocations_.emplace_back(allocation); } } @@ -268,11 +281,17 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { auto host_x = std::unique_ptr(new int[data_num_]); size_t thread_num = grid_num_ * block_num_; for (int i = 0; i < stream_num_; ++i) { - // tricky code, the allocations are still accessible even though - // allocations_.clear() has been called +// tricky code, the allocations are still accessible even though +// allocations_.clear() has been called +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemcpy(host_x.get(), allocations_[i]->ptr(), data_num_ * sizeof(int), cudaMemcpyDeviceToHost)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + hipMemcpy(host_x.get(), allocations_[i]->ptr(), + data_num_ * sizeof(int), hipMemcpyDeviceToHost)); +#endif for (int j = 0; j < data_num_; ++j) { EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_); } @@ -280,13 +299,21 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { } void TearDown() override { +#ifdef PADDLE_WITH_CUDA cudaDeviceSynchronize(); - for (cudaStream_t stream : streams_) { +#else + hipDeviceSynchronize(); +#endif + for (gpuStream_t stream : streams_) { Release(place_, stream); } for (size_t i = 1; i < stream_num_; ++i) { +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i])); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(streams_[i])); +#endif } uint64_t cuda_malloc_size = @@ -303,8 +330,8 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { size_t data_num_; uint64_t cuda_malloc_size_before_test_; 
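   // place_ defaults to device 0; default_stream stays nullptr on purpose so
   // that streams_[0] exercises the legacy default stream alongside the
   // explicitly created ones (see SetUp above).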
platform::CUDAPlace place_; - cudaStream_t default_stream; - std::vector streams_; + gpuStream_t default_stream; + std::vector streams_; std::vector> allocations_; }; From dc7a05503ae590ff3e310b118827aa1b4a6d19b2 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Fri, 19 Nov 2021 11:23:20 +0800 Subject: [PATCH 05/13] Release memory for StreamSafeCUDAAllocaRetry in malloc_test --- .../allocation/stream_safe_cuda_allocator.cc | 94 +++++++++---------- .../allocation/stream_safe_cuda_allocator.h | 2 +- paddle/fluid/memory/malloc_test.cu | 23 ++--- 3 files changed, 60 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index d7515baa2787b2..7ac20ab9b0f967 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -48,53 +48,6 @@ StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } -void StreamSafeCUDAAllocator::ProcessEventsAndFree() { - for (auto map_it = allocation_info_map_.begin(); - map_it != allocation_info_map_.end();) { - std::deque& outstanding_events = - map_it->second->outstanding_events; - VLOG(10) << "Check " << outstanding_events.size() - << " outstanding events for " << map_it->first->ptr(); - auto deque_it = outstanding_events.begin(); - while (deque_it != outstanding_events.end()) { -#ifdef PADDLE_WITH_CUDA - gpuError_t err = cudaEventQuery(*deque_it); - if (err == cudaErrorNotReady) { - VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() - << " is not complete"; - outstanding_events.erase(outstanding_events.begin(), deque_it); - break; - } - PADDLE_ENFORCE_CUDA_SUCCESS(err); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it)); - ++deque_it; -#else - gpuError_t err = hipEventQuery(*deque_it); - if (err == hipErrorNotReady) { - VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() - << " is not complete"; - outstanding_events.erase(outstanding_events.begin(), deque_it); - break; - } - PADDLE_ENFORCE_CUDA_SUCCESS(err); - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it)); - ++deque_it; -#endif - } - - if (deque_it == outstanding_events.end()) { - outstanding_events.clear(); - Allocation* allocation = map_it->first; - // "map_it" may be invalid after calling FreeStreamSafeCUDAAllocation - auto next_it = ++map_it; - FreeStreamSafeCUDAAllocation(allocation); - map_it = next_it; - } else { - ++map_it; - } - } -} - Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { std::lock_guard lock(mutex_); ProcessEventsAndFree(); @@ -174,6 +127,53 @@ StreamSafeCUDAAllocator::GetAllocationInfo(Allocation* allocation) { return it->second; } +void StreamSafeCUDAAllocator::ProcessEventsAndFree() { + for (auto map_it = allocation_info_map_.begin(); + map_it != allocation_info_map_.end();) { + std::deque& outstanding_events = + map_it->second->outstanding_events; + VLOG(10) << "Check " << outstanding_events.size() + << " outstanding events for " << map_it->first->ptr(); + auto deque_it = outstanding_events.begin(); + while (deque_it != outstanding_events.end()) { +#ifdef PADDLE_WITH_CUDA + gpuError_t err = cudaEventQuery(*deque_it); + if (err == cudaErrorNotReady) { + VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() + << " is not complete"; + outstanding_events.erase(outstanding_events.begin(), deque_it); + break; + } + PADDLE_ENFORCE_CUDA_SUCCESS(err); + 
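+      // cudaEventQuery returns cudaSuccess only once every operation captured
+      // by the matching cudaEventRecord has finished; cudaErrorNotReady was
+      // handled above and any other error was re-raised by the enforce, so
+      // the event is known complete here and can be destroyed safely.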
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it)); + ++deque_it; +#else + gpuError_t err = hipEventQuery(*deque_it); + if (err == hipErrorNotReady) { + VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() + << " is not complete"; + outstanding_events.erase(outstanding_events.begin(), deque_it); + break; + } + PADDLE_ENFORCE_CUDA_SUCCESS(err); + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it)); + ++deque_it; +#endif + } + + if (deque_it == outstanding_events.end()) { + outstanding_events.clear(); + Allocation* allocation = map_it->first; + // "map_it" may be invalid after calling FreeStreamSafeCUDAAllocation + auto next_it = ++map_it; + FreeStreamSafeCUDAAllocation(allocation); + map_it = next_it; + } else { + ++map_it; + } + } +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index f74c01e970c109..f625a8063becd6 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -50,7 +50,6 @@ class StreamSafeCUDAAllocator : public Allocator { const std::shared_ptr &underlying_allocator, const gpuStream_t default_stream); bool IsAllocThreadSafe() const override; - void ProcessEventsAndFree(); protected: Allocation *AllocateImpl(size_t size) override; @@ -68,6 +67,7 @@ class StreamSafeCUDAAllocator : public Allocator { std::deque *outstanding_events); void FreeStreamSafeCUDAAllocation(Allocation *allocation); std::shared_ptr GetAllocationInfo(Allocation *); + void ProcessEventsAndFree(); std::shared_ptr underlying_allocator_; gpuStream_t default_stream_; diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index 3d0c1896bfaafa..cf7e1276e88b5a 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -171,7 +171,6 @@ TEST(Malloc, AllocZero) { EXPECT_GE(allocation_ptr->size(), 0); } -#ifdef PADDLE_WITH_CUDA TEST(Malloc, StreamSafeCUDAAllocRetry) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; @@ -197,6 +196,13 @@ TEST(Malloc, StreamSafeCUDAAllocRetry) { allocation1.reset(); // free but not release th.join(); EXPECT_GE(allocation2->size(), alloc_size); + allocation2.reset(); + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); +#endif Release(place, stream1); Release(place, stream2); @@ -218,8 +224,6 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { block_num_ = 64; data_num_ = 64; default_stream = nullptr; - cuda_malloc_size_before_test_ = - platform::RecordedCudaMallocSize(place_.GetDeviceId()); streams_.reserve(stream_num_); streams_.emplace_back(default_stream); @@ -300,9 +304,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { void TearDown() override { #ifdef PADDLE_WITH_CUDA - cudaDeviceSynchronize(); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); #else - hipDeviceSynchronize(); + PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); #endif for (gpuStream_t stream : streams_) { Release(place_, stream); @@ -318,17 +322,15 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { uint64_t cuda_malloc_size = platform::RecordedCudaMallocSize(place_.GetDeviceId()); - ASSERT_EQ(cuda_malloc_size, cuda_malloc_size_before_test_) - << "Found " << cuda_malloc_size - << " bytes memory that not released yet, there may be 
a memory leak " - "problem."; + ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size + << " bytes memory that not released yet," + << " there may be a memory leak problem"; } size_t stream_num_; size_t grid_num_; size_t block_num_; size_t data_num_; - uint64_t cuda_malloc_size_before_test_; platform::CUDAPlace place_; gpuStream_t default_stream; std::vector streams_; @@ -344,7 +346,6 @@ TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStream) { MultiThreadMUltiStreamRun(); CheckResult(); } -#endif } // namespace memory } // namespace paddle From 165d4999704517797d97d4355a56d1f6dab82910 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Sat, 20 Nov 2021 21:12:25 +0800 Subject: [PATCH 06/13] Add FLAGS_use_stream_safe_cuda_allocator --- paddle/fluid/memory/CMakeLists.txt | 6 + .../memory/allocation/allocator_facade.cc | 320 +++++++++++++----- paddle/fluid/memory/malloc_test.cu | 178 ---------- .../memory/stream_safe_cuda_alloc_test.cu | 211 ++++++++++++ 4 files changed, 454 insertions(+), 261 deletions(-) create mode 100644 paddle/fluid/memory/stream_safe_cuda_alloc_test.cu diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 75b1bffca31f84..0de114ae40728b 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -17,6 +17,12 @@ if (WITH_GPU) nv_test(malloc_test SRCS malloc_test.cu DEPS device_context malloc) + nv_test(stream_safe_cuda_alloc_test + SRCS stream_safe_cuda_alloc_test.cu + DEPS malloc) + set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES + ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true" + ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") endif() if (WITH_ROCM) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index a47dba54932646..0c1d458687a1ec 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -68,6 +68,12 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, "Use VirtualMemoryAutoGrowthBestFitAllocator."); +// NOTE(Ruibiao): This FLAGS is just to be compatible with the old single-stream +// CUDA allocator, and will be remove after StreamSafeCudaAllocator has been +// fully tested +PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true, + "Enable StreamSafeCUDAAllocator"); + DECLARE_string(allocator_strategy); namespace paddle { @@ -131,36 +137,88 @@ class AllocatorFacadePrivate { explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { strategy_ = GetAllocatorStrategy(); - CheckStrategy(strategy_); - - InitNaiveBestFitCPUAllocator(); + switch (strategy_) { + case AllocatorStrategy::kNaiveBestFit: { + InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + ++dev_id) { + InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); + } + InitNaiveBestFitCUDAPinnedAllocator(); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } + InitNaiveBestFitNPUPinnedAllocator(); +#endif + break; + } + case AllocatorStrategy::kAutoGrowth: { + InitNaiveBestFitCPUAllocator(); #if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - allow_free_idle_chunk_ = allow_free_idle_chunk; - default_stream_ = nullptr; - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { - InitCUDAAllocator(platform::CUDAPlace(dev_id), default_stream_); - } - InitNaiveBestFitCUDAPinnedAllocator(); + allow_free_idle_chunk_ = allow_free_idle_chunk; + if (FLAGS_use_stream_safe_cuda_allocator) { + // TODO(Ruibiao): Support multi-stream allocator for other strategies + default_stream_ = nullptr; + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + ++dev_id) { + InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), + default_stream_); + } + } else { + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + ++dev_id) { + InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), + allow_free_idle_chunk_); + } + } + InitNaiveBestFitCUDAPinnedAllocator(); #endif +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif + break; + } + case AllocatorStrategy::kThreadLocal: { + InitNaiveBestFitCPUAllocator(); #ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); - } + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + ++dev_id) { + InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); + } + InitNaiveBestFitCUDAPinnedAllocator(); #endif + break; + } -#ifdef PADDLE_WITH_ASCEND_CL - if (strategy_ == AllocatorStrategy::kNaiveBestFit) { - for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { - InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported allocator strategy: %d", static_cast(strategy_))); } - InitNaiveBestFitNPUPinnedAllocator(); } -#endif - InitZeroSizeAllocators(); InitSystemAllocators(); + + if (FLAGS_gpu_allocator_retry_time > 0) { + WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time); + } + CheckAllocThreadSafe(); } @@ -220,12 +278,6 @@ class AllocatorFacadePrivate { AllocationPtr CUDAAlloc(const platform::CUDAPlace& place, const gpuStream_t& stream, size_t size) { std::shared_ptr cuda_allocator; - /* NOTE(Ruibiao): This code does not lead to lock competition - * for seraching initialized CUDA allocator in multithreaded scenario. - * However, when the corresponding CUDA allocator is not initialized, - * it may result in large lookup overhead, - * which call GetCUDAAAllocator 3 times in the worst case. - **/ try { cuda_allocator = GetCUDAAllocator(place, stream); } catch (platform::EnforceNotMet& err) { @@ -234,8 +286,10 @@ class AllocatorFacadePrivate { std::unique_lock lock(cuda_init_mutex_); try { cuda_allocator = GetCUDAAllocator(place, stream); + VLOG(9) << "Other thread had build a new allocator for stream " + << stream << " in place " << place; } catch (platform::EnforceNotMet& err) { - InitCUDAAllocator(place, stream); + InitStreamSafeCUDAAllocator(place, stream); cuda_allocator = GetCUDAAllocator(place, stream); } catch (...) 
{ throw; @@ -314,15 +368,6 @@ class AllocatorFacadePrivate { platform::Place place_; }; - void CheckStrategy(AllocatorStrategy strategy) { - if (strategy != AllocatorStrategy::kNaiveBestFit && - strategy != AllocatorStrategy::kAutoGrowth && - strategy != AllocatorStrategy::kThreadLocal) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported allocator strategy: %d", static_cast(strategy_))); - } - } - const AllocatorMap& GetAllocatorMap() { #ifdef PADDLE_WITH_CUDA if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { @@ -352,32 +397,23 @@ class AllocatorFacadePrivate { std::make_shared(platform::CUDAPinnedPlace()); } - void InitCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { - switch (strategy_) { - case AllocatorStrategy::kNaiveBestFit: { - InitNaiveBestFitCUDAAllocator(p, stream); - break; - } - case AllocatorStrategy::kAutoGrowth: { - InitAutoGrowthCUDAAllocator(p, stream); - break; - } - case AllocatorStrategy::kThreadLocal: { - InitThreadLocalCUDAAllocator(p, stream); - break; - } - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported allocator strategy: %d", static_cast(strategy_))); - } - } + void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { + PADDLE_ENFORCE_EQ( + strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::Unimplemented( + "Now only support auto-growth strategey for " + "StreamSafeCUDAAllocator, " + "the allocator strategy %d is unsupported for multi-stream", + static_cast(strategy_))); + VLOG(9) << "Init CUDA allocator for stream " << stream << " in place " << p; + + InitAutoGrowthCUDAAllocator(p, stream); WrapStreamSafeCUDAAllocator(p, stream); WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time); } - void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p, - gpuStream_t stream) { - cuda_allocators_[p][stream] = std::make_shared(p); + void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) { + allocators_[p] = std::make_shared(p); } void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { @@ -455,8 +491,83 @@ class AllocatorFacadePrivate { #endif } - void InitThreadLocalCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { - cuda_allocators_[p][stream] = std::make_shared(p); + // NOTE(Ruibiao): Old single-stream version, will be removed later + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, + bool allow_free_idle_chunk) { +#if defined(PADDLE_WITH_HIP) + auto cuda_allocator = std::make_shared(p); + allocators_[p] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); +#endif + +#if defined(PADDLE_WITH_CUDA) +#if CUDA_VERSION >= 10020 + CUdevice device; + int val; + try { + PADDLE_ENFORCE_CUDA_SUCCESS( + paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); + + PADDLE_ENFORCE_CUDA_SUCCESS( + paddle::platform::dynload::cuDeviceGetAttribute( + &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, + device)); + } catch (...) 
{ + val = 0; + } + + if (val > 0 && FLAGS_use_virtual_memory_auto_growth) { + auto cuda_allocator = std::make_shared(p); + allocators_[p] = + std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), p); + } else { + auto cuda_allocator = std::make_shared(p); + allocators_[p] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); + } + +#else + auto cuda_allocator = std::make_shared(p); + auto alignment = platform::GpuMinChunkSize(); + bool need_addr_align = true; + // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda + // API in that case may got cuda error(3), i.e., + // cudaErrorInitializationError. And, the CUDAAllocator is only initialized + // but not really used. + // Here, the try-catch block is added to handle the case that + // GetDeviceProperties() may failed in the multiple process(for example, in + // dataloader with num_worker > 0) + try { + const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); + need_addr_align = prop.textureAlignment < alignment; + VLOG(4) << "GetDeviceProperties ok, textureAlignment: " + << prop.textureAlignment + << ", set need_addr_align=" << need_addr_align; + } catch (...) { + need_addr_align = true; + VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true"; + } + // The address returned is aligned already, + // ref: + // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295 + std::shared_ptr underlying_allocator{nullptr}; + if (need_addr_align) { + VLOG(10) << "use AlignedAllocator with alignment: " << alignment; + underlying_allocator = + std::make_shared(underlying_allocator, alignment); + } else { + VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; + underlying_allocator = cuda_allocator; + } + allocators_[p] = std::make_shared( + underlying_allocator, alignment, 0, allow_free_idle_chunk); +#endif +#endif + } + + void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) { + allocators_[p] = std::make_shared(p); } void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, @@ -504,6 +615,27 @@ class AllocatorFacadePrivate { } #endif + void InitSystemAllocators() { + if (!system_allocators_.empty()) return; + system_allocators_[platform::CPUPlace()] = std::make_shared(); +#ifdef PADDLE_WITH_XPU + int device_count = platform::GetXPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::XPUPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + system_allocators_[platform::CUDAPinnedPlace()] = + std::make_shared(); + int device_count = platform::GetCUDADeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::CUDAPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif + } + void InitZeroSizeAllocators() { if (!zero_size_allocators_.empty()) return; std::vector places; @@ -541,36 +673,30 @@ class AllocatorFacadePrivate { } } - void InitSystemAllocators() { - if (!system_allocators_.empty()) return; - system_allocators_[platform::CPUPlace()] = std::make_shared(); -#ifdef PADDLE_WITH_XPU - int device_count = platform::GetXPUDeviceCount(); - for (int i = 0; i < device_count; ++i) { - platform::XPUPlace p(i); - system_allocators_[p] = std::make_shared(p); - } -#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - system_allocators_[platform::CUDAPinnedPlace()] = - std::make_shared(); - int device_count = platform::GetCUDADeviceCount(); - for (int i = 0; i < 
device_count; ++i) { - platform::CUDAPlace p(i); - system_allocators_[p] = std::make_shared(p); - } -#endif - } - void CheckAllocThreadSafe() const { CheckAllocThreadSafe(allocators_); CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - CheckCUDAAllocThreadSafe(cuda_allocators_); + if (FLAGS_use_stream_safe_cuda_allocator) { + CheckCUDAAllocThreadSafe(cuda_allocators_); + } #endif } + // NOTE(Ruibiao): Old single-stream version, will be removed later + void WrapCUDARetryAllocator(size_t retry_time) { + PADDLE_ENFORCE_GT( + retry_time, 0, + platform::errors::InvalidArgument( + "Retry time should be larger than 0, but got %d", retry_time)); + for (auto& pair : allocators_) { + if (platform::is_gpu_place(pair.first)) { + pair.second = std::make_shared(pair.second, retry_time); + } + } + } + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; @@ -610,7 +736,7 @@ std::shared_ptr AllocatorFacade::AllocShared( AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::is_gpu_place(place)) { + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place)) { return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultStream(), size); } @@ -620,7 +746,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (platform::is_gpu_place(place)) { + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place)) { return Release(BOOST_GET_CONST(platform::CUDAPlace, place), m_->GetDefaultStream()); } @@ -637,11 +763,25 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::shared_ptr AllocatorFacade::AllocShared( const platform::CUDAPlace& place, const gpuStream_t& stream, size_t size) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'AllocaShared' function. " + "To enable it, you can enter 'export " + "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); return std::shared_ptr(Alloc(place, stream, size)); } AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, const gpuStream_t& stream, size_t size) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'Alloca' function. " + "To enable it, you can enter 'export " + "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); if (size > 0) { return m_->CUDAAlloc(place, stream, size); } else { @@ -651,11 +791,25 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place, uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'Release' function. 
" + "To enable it, you can enter 'export " + "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); return m_->GetCUDAAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(Allocation* allocation, const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "'RecordStream' function. " + "To enable it, you can enter 'export " + "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); m_->RecordStream(allocation, stream); } diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index cf7e1276e88b5a..d015ed7ce693fb 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -27,7 +27,6 @@ #include "gtest/gtest.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace memory { @@ -170,182 +169,5 @@ TEST(Malloc, AllocZero) { AllocationPtr allocation_ptr = Alloc(place, 0); EXPECT_GE(allocation_ptr->size(), 0); } - -TEST(Malloc, StreamSafeCUDAAllocRetry) { - platform::CUDAPlace place = platform::CUDAPlace(); - gpuStream_t stream1, stream2; -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream1)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream2)); -#endif - size_t available_size = platform::GpuAvailableMemToAlloc(); - // alloc_size < available_size < 2 * alloc_size - size_t alloc_size = available_size / 4 * 3; - - std::shared_ptr allocation1 = - AllocShared(place, stream1, alloc_size); - std::shared_ptr allocation2; - - std::thread th([&allocation2, &place, &stream2, alloc_size]() { - std::this_thread::sleep_for(std::chrono::seconds(1)); - allocation2 = AllocShared(place, stream2, alloc_size); - }); - allocation1.reset(); // free but not release - th.join(); - EXPECT_GE(allocation2->size(), alloc_size); - allocation2.reset(); - -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); -#endif - - Release(place, stream1); - Release(place, stream2); -} - -__global__ void add_kernel(int *x, int n) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < n; i += blockDim.x * gridDim.x) { - atomicAdd(x + i, tid); - } -} - -class StreamSafeCUDAAllocTest : public ::testing::Test { - protected: - void SetUp() override { - place_ = platform::CUDAPlace(); - stream_num_ = 64; - grid_num_ = 1; - block_num_ = 64; - data_num_ = 64; - default_stream = nullptr; - - streams_.reserve(stream_num_); - streams_.emplace_back(default_stream); - for (size_t i = 1; i < stream_num_; ++i) { - gpuStream_t stream; -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); -#endif - streams_.emplace_back(stream); - } - - for (size_t i = 0; i < stream_num_; ++i) { - size_t allocation_size = data_num_ * sizeof(int); - std::shared_ptr allocation = - AllocShared(place_, streams_[i], allocation_size); -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemset(allocation->ptr(), 0, allocation->size())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemset(allocation->ptr(), 0, allocation->size())); -#endif - 
allocations_.emplace_back(allocation); - } - } - - void SingleStreamRun(size_t idx) { - for (size_t i = 0; i < stream_num_; ++i) { - int *x = reinterpret_cast(allocations_[i]->ptr()); - add_kernel<<>>(x, data_num_); - if (i != idx) { - RecordStream(allocations_[i].get(), streams_[idx]); - } - } - } - - void MultiStreamRun() { - for (int i = 0; i < stream_num_; ++i) { - SingleStreamRun(i); - } - allocations_.clear(); // fast_gc - } - - void MultiThreadMUltiStreamRun() { - std::vector threads; - for (size_t i = 0; i < stream_num_; ++i) { - threads.push_back( - std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); - } - for (size_t i = 0; i < stream_num_; ++i) { - threads[i].join(); - } - allocations_.clear(); // fast_gc - } - - void CheckResult() { - auto host_x = std::unique_ptr(new int[data_num_]); - size_t thread_num = grid_num_ * block_num_; - for (int i = 0; i < stream_num_; ++i) { -// tricky code, the allocations are still accessible even though -// allocations_.clear() has been called -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaMemcpy(host_x.get(), allocations_[i]->ptr(), - data_num_ * sizeof(int), cudaMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - hipMemcpy(host_x.get(), allocations_[i]->ptr(), - data_num_ * sizeof(int), hipMemcpyDeviceToHost)); -#endif - for (int j = 0; j < data_num_; ++j) { - EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_); - } - } - } - - void TearDown() override { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); -#endif - for (gpuStream_t stream : streams_) { - Release(place_, stream); - } - - for (size_t i = 1; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i])); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(streams_[i])); -#endif - } - - uint64_t cuda_malloc_size = - platform::RecordedCudaMallocSize(place_.GetDeviceId()); - ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size - << " bytes memory that not released yet," - << " there may be a memory leak problem"; - } - - size_t stream_num_; - size_t grid_num_; - size_t block_num_; - size_t data_num_; - platform::CUDAPlace place_; - gpuStream_t default_stream; - std::vector streams_; - std::vector> allocations_; -}; - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStream) { - MultiStreamRun(); - CheckResult(); -} - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStream) { - MultiThreadMUltiStreamRun(); - CheckResult(); -} - } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu new file mode 100644 index 00000000000000..a75ef88141d6ca --- /dev/null +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -0,0 +1,211 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
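// ---- Editor's note (illustrative, not part of the patch) ----
// The expected values asserted by CheckResult() in the new test below can be
// derived directly: add_kernel is launched once per stream for every buffer,
// and each launch executes atomicAdd(x + tid, tid) for thread tid. With
// grid_num_ = 1 and block_num_ = 64 there are thread_num = 64 threads, and
// each thread touches only index i = tid (the next stride, tid + 64, already
// exceeds n = 64). Hence after all stream_num_ = 64 launches:
//
//   x[j] == (j % thread_num) * stream_num_   // e.g. x[3] == 3 * 64 == 192
//
// which is precisely the condition EXPECT_TRUE checks in CheckResult().
// --------------------------------------------------------------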
+ +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace memory { + +__global__ void add_kernel(int *x, int n) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < n; i += blockDim.x * gridDim.x) { + atomicAdd(x + i, tid); + } +} + +class StreamSafeCUDAAllocTest : public ::testing::Test { + protected: + void SetUp() override { + place_ = platform::CUDAPlace(); + stream_num_ = 64; + grid_num_ = 1; + block_num_ = 64; + data_num_ = 64; + default_stream = nullptr; + + streams_.reserve(stream_num_); + streams_.emplace_back(default_stream); + for (size_t i = 1; i < stream_num_; ++i) { + gpuStream_t stream; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); +#endif + streams_.emplace_back(stream); + } + + for (size_t i = 0; i < stream_num_; ++i) { + size_t allocation_size = data_num_ * sizeof(int); + std::shared_ptr allocation = + AllocShared(place_, streams_[i], allocation_size); +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemset(allocation->ptr(), 0, allocation->size())); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + hipMemset(allocation->ptr(), 0, allocation->size())); +#endif + allocations_.emplace_back(allocation); + } + } + + void SingleStreamRun(size_t idx) { + for (size_t i = 0; i < stream_num_; ++i) { + int *x = reinterpret_cast(allocations_[i]->ptr()); + add_kernel<<>>(x, data_num_); + if (i != idx) { + RecordStream(allocations_[i].get(), streams_[idx]); + } + } + } + + void MultiStreamRun() { + for (int i = 0; i < stream_num_; ++i) { + SingleStreamRun(i); + } + allocations_.clear(); // fast_gc + } + + void MultiThreadMUltiStreamRun() { + std::vector threads; + for (size_t i = 0; i < stream_num_; ++i) { + threads.push_back( + std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); + } + for (size_t i = 0; i < stream_num_; ++i) { + threads[i].join(); + } + allocations_.clear(); // fast_gc + } + + void CheckResult() { + auto host_x = std::unique_ptr(new int[data_num_]); + size_t thread_num = grid_num_ * block_num_; + for (int i = 0; i < stream_num_; ++i) { +// tricky code, the allocations are still accessible even though +// allocations_.clear() has been called +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpy(host_x.get(), allocations_[i]->ptr(), + data_num_ * sizeof(int), cudaMemcpyDeviceToHost)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + hipMemcpy(host_x.get(), allocations_[i]->ptr(), + data_num_ * sizeof(int), hipMemcpyDeviceToHost)); +#endif + for (int j = 0; j < data_num_; ++j) { + EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_); + } + } + } + + void TearDown() override { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); +#endif + for (gpuStream_t stream : streams_) { + Release(place_, stream); + } + + for (size_t i = 1; i < stream_num_; ++i) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i])); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(streams_[i])); +#endif + } + + uint64_t cuda_malloc_size = + platform::RecordedCudaMallocSize(place_.GetDeviceId()); + ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size + << " bytes memory that not released yet," + << 
" there may be a memory leak problem"; + } + + size_t stream_num_; + size_t grid_num_; + size_t block_num_; + size_t data_num_; + platform::CUDAPlace place_; + gpuStream_t default_stream; + std::vector streams_; + std::vector> allocations_; +}; + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStream) { + MultiStreamRun(); + CheckResult(); +} + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStream) { + MultiThreadMUltiStreamRun(); + CheckResult(); +} + +TEST(StreamSafeCUDAAllocRetryTest, StreamSafeCUDAAllocRetry) { + platform::CUDAPlace place = platform::CUDAPlace(); + gpuStream_t stream1, stream2; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream1)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream2)); +#endif + size_t available_size = platform::GpuAvailableMemToAlloc(); + // alloc_size < available_size < 2 * alloc_size + size_t alloc_size = available_size / 4 * 3; + + std::shared_ptr allocation1 = + AllocShared(place, stream1, alloc_size); + std::shared_ptr allocation2; + + std::thread th([&allocation2, &place, &stream2, alloc_size]() { + std::this_thread::sleep_for(std::chrono::seconds(1)); + allocation2 = AllocShared(place, stream2, alloc_size); + }); + allocation1.reset(); // free but not release + th.join(); + EXPECT_GE(allocation2->size(), alloc_size); + allocation2.reset(); + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); +#endif + + Release(place, stream1); + Release(place, stream2); +} + +} // namespace memory +} // namespace paddle From dc50b288e0634e980c4494f6e3b47bb3e927dd38 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Sat, 20 Nov 2021 21:28:59 +0800 Subject: [PATCH 07/13] Fix CI error for 'set_tests_properties' --- paddle/fluid/memory/CMakeLists.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 0de114ae40728b..64fd04b5ea82ba 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -20,9 +20,12 @@ if (WITH_GPU) nv_test(stream_safe_cuda_alloc_test SRCS stream_safe_cuda_alloc_test.cu DEPS malloc) - set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES - ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true" - ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") + + if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test) + set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES + ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true" + ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") + endif() endif() if (WITH_ROCM) From 43353c3db54c24350d5cbd69d1ddcf08caebd0fd Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Sat, 20 Nov 2021 21:54:40 +0800 Subject: [PATCH 08/13] Invalidate stream safe CUDA allocator for naive_best_fit and thread_local strategy --- .../memory/allocation/allocator_facade.cc | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 0c1d458687a1ec..9f5dec1a7c4cc9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -140,18 +140,23 @@ class AllocatorFacadePrivate { switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { 
InitNaiveBestFitCPUAllocator(); -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); - } -#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator) { + LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " + "naive_best_fit strategy"; + FLAGS_use_stream_safe_cuda_allocator = false; + } for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); #endif +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif #ifdef PADDLE_WITH_ASCEND_CL for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); @@ -198,6 +203,12 @@ class AllocatorFacadePrivate { } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator) { + LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for " + "thread_local strategy"; + FLAGS_use_stream_safe_cuda_allocator = false; + } + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); @@ -228,7 +239,8 @@ class AllocatorFacadePrivate { << " " << place << " " << size; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(place) && size > 0) { + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + size > 0) { return GetCUDAAllocator(BOOST_GET_CONST(platform::CUDAPlace, place), default_stream_); } From dc1f33c0026fc29b0485bba84dc19c2daafd8c45 Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Mon, 22 Nov 2021 15:47:25 +0800 Subject: [PATCH 09/13] Performance improvement: insert allocation pair to outstanding_events_map when free but not alloc; replace recursive_mutex with SpinLock --- .../memory/allocation/allocator_facade.cc | 7 +-- .../allocation/stream_safe_cuda_allocator.cc | 45 ++++++------------- .../allocation/stream_safe_cuda_allocator.h | 16 +++---- 3 files changed, 22 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9f5dec1a7c4cc9..8e3aa349e72bce 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -68,9 +68,10 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, "Use VirtualMemoryAutoGrowthBestFitAllocator."); -// NOTE(Ruibiao): This FLAGS is just to be compatible with the old single-stream -// CUDA allocator, and will be remove after StreamSafeCudaAllocator has been -// fully tested +// NOTE(Ruibiao): This FLAGS is just to be compatibled with the old +// single-stream +// CUDA allocator. It will be removed after StreamSafeCudaAllocator has been +// fully tested. 
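// ---- Editor's sketch (illustrative, not part of the patch) ----
// How a caller is expected to combine the multi-stream interface with
// RecordStream(); both are used this way in stream_safe_cuda_alloc_test.cu.
// The argument order shown is the one in force at this point in the series
// (place, stream, size); patch 13 later changes it to (place, size, stream).
//
//   gpuStream_t stream_a, stream_b;  // created with cudaStreamCreate
//   auto buf = memory::AllocShared(place, stream_a, size);  // owned by A
//   // ... launch work that reads/writes buf->ptr() on stream_b ...
//   memory::RecordStream(buf.get(), stream_b);  // register the extra stream
//   buf.reset();  // safe: the real free waits on A's and B's recorded events
// ----------------------------------------------------------------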
PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true, "Enable StreamSafeCUDAAllocator"); diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 7ac20ab9b0f967..5996786995656d 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -31,7 +31,7 @@ void StreamSafeCUDAAllocation::RecordStream(gpuStream_t stream) { if (stream == owning_stream_) { return; } - std::lock_guard lock(mutex_); + std::lock_guard lock_guard(spin_lock_); recorded_streams_->insert(stream); } @@ -49,24 +49,22 @@ StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { - std::lock_guard lock(mutex_); + std::lock_guard lock_guard(spin_lock_); ProcessEventsAndFree(); AllocationPtr underlying_allocation = underlying_allocator_->Allocate(size); StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( std::move(underlying_allocation), default_stream_); - allocation_info_map_[allocation] = std::make_shared(); return allocation; } void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { - std::lock_guard lock(mutex_); - GetAllocationInfo(allocation)->can_be_freed = true; + std::lock_guard lock_guard(spin_lock_); FreeStreamSafeCUDAAllocation(allocation); } uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { /*lock_guard*/ { - std::lock_guard lock(mutex_); + std::lock_guard lock_guard(spin_lock_); ProcessEventsAndFree(); } return underlying_allocator_->Release(place); @@ -94,14 +92,8 @@ void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream( void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation( Allocation* allocation) { - std::shared_ptr allocation_info = - GetAllocationInfo(allocation); - if (!allocation_info->can_be_freed) { - return; - } - std::deque& outstanding_events = - allocation_info->outstanding_events; + outstanding_events_map_[allocation]; CreateEventForAllRecordedStream( dynamic_cast(allocation) ->GetRecordedStreams() @@ -113,25 +105,14 @@ void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation( } VLOG(8) << "Free " << allocation->ptr(); - allocation_info_map_.erase(allocation); + outstanding_events_map_.erase(allocation); delete allocation; } -std::shared_ptr -StreamSafeCUDAAllocator::GetAllocationInfo(Allocation* allocation) { - auto it = allocation_info_map_.find(allocation); - PADDLE_ENFORCE_NE( - it, allocation_info_map_.end(), - platform::errors::NotFound( - "The recorded allocation is not malloced by this allocator")); - return it->second; -} - void StreamSafeCUDAAllocator::ProcessEventsAndFree() { - for (auto map_it = allocation_info_map_.begin(); - map_it != allocation_info_map_.end();) { - std::deque& outstanding_events = - map_it->second->outstanding_events; + for (auto map_it = outstanding_events_map_.begin(); + map_it != outstanding_events_map_.end();) { + std::deque& outstanding_events = map_it->second; VLOG(10) << "Check " << outstanding_events.size() << " outstanding events for " << map_it->first->ptr(); auto deque_it = outstanding_events.begin(); @@ -140,25 +121,25 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { gpuError_t err = cudaEventQuery(*deque_it); if (err == cudaErrorNotReady) { VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr() - << " is not complete"; + << " is not completed"; 
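// ---- Editor's sketch (illustrative, not part of the patch) ----
// The loop being edited here implements deferred freeing: a block freed by
// the host is only handed back to the underlying allocator once every event
// recorded for it has completed. Reduced to a standalone helper, the CUDA
// branch does the equivalent of:
//
//   bool AllEventsCompleted(std::deque<gpuEvent_t>* events) {
//     while (!events->empty()) {
//       gpuError_t err = cudaEventQuery(events->front());
//       if (err == cudaErrorNotReady) return false;  // GPU still in flight
//       PADDLE_ENFORCE_CUDA_SUCCESS(err);            // surface real errors
//       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(events->front()));
//       events->pop_front();
//     }
//     return true;  // all streams are past the free point
//   }
// ----------------------------------------------------------------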
        outstanding_events.erase(outstanding_events.begin(), deque_it);
         break;
       }
       PADDLE_ENFORCE_CUDA_SUCCESS(err);
       PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(*deque_it));
-      ++deque_it;
 #else
       gpuError_t err = hipEventQuery(*deque_it);
       if (err == hipErrorNotReady) {
         VLOG(10) << "Event " << *deque_it << " for " << map_it->first->ptr()
-                 << " is not complete";
+                 << " is not completed";
+        // Erase the completed events before "deque_it"
         outstanding_events.erase(outstanding_events.begin(), deque_it);
         break;
       }
       PADDLE_ENFORCE_CUDA_SUCCESS(err);
       PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(*deque_it));
-      ++deque_it;
 #endif
+      ++deque_it;
     }

     if (deque_it == outstanding_events.end()) {
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
index f625a8063becd6..93da1fd664f018 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h
@@ -20,11 +20,12 @@
 #endif

 #include <deque>
+#include <map>
 #include <memory>
 #include <mutex>
 #include <set>
-#include <unordered_map>
 #include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/memory/allocation/spin_lock.h"

 namespace paddle {
 namespace memory {
@@ -41,7 +42,7 @@ class StreamSafeCUDAAllocation : public Allocation {
   AllocationPtr underlying_allocation_;
   gpuStream_t owning_stream_;
   std::shared_ptr<std::set<gpuStream_t>> recorded_streams_;
-  std::mutex mutex_;
+  SpinLock spin_lock_;
 };

 class StreamSafeCUDAAllocator : public Allocator {
@@ -57,23 +58,16 @@ class StreamSafeCUDAAllocator : public Allocator {
   uint64_t ReleaseImpl(const platform::Place &place) override;

  private:
-  struct AllocationInfo {
-    std::deque<gpuEvent_t> outstanding_events;
-    bool can_be_freed{false};
-  };
-
   void CreateEventForAllRecordedStream(
       std::set<gpuStream_t> *recorded_streams,
       std::deque<gpuEvent_t> *outstanding_events);
   void FreeStreamSafeCUDAAllocation(Allocation *allocation);
-  std::shared_ptr<AllocationInfo> GetAllocationInfo(Allocation *);
   void ProcessEventsAndFree();

   std::shared_ptr<Allocator> underlying_allocator_;
   gpuStream_t default_stream_;
-  std::unordered_map<Allocation *, std::shared_ptr<AllocationInfo>>
-      allocation_info_map_;
-  mutable std::recursive_mutex mutex_;
+  std::map<Allocation *, std::deque<gpuEvent_t>> outstanding_events_map_;
+  SpinLock spin_lock_;
 };

 }  // namespace allocation

From 9b881d45d2bed00632b9f5a8885dce91c9324fd4 Mon Sep 17 00:00:00 2001
From: chenruibiao
Date: Mon, 22 Nov 2021 16:33:43 +0800
Subject: [PATCH 10/13] FLAGS priority changes: FLAGS_use_system_allocator >
 FLAGS_use_stream_safe_cuda_allocator

---
 paddle/fluid/memory/CMakeLists.txt            |  1 +
 .../memory/allocation/allocator_facade.cc     | 35 ++++++++++---------
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt
index 64fd04b5ea82ba..69134e1c76bb74 100644
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@@ -23,6 +23,7 @@ if (WITH_GPU)

   if(WITH_TESTING AND TEST stream_safe_cuda_alloc_test)
     set_tests_properties(stream_safe_cuda_alloc_test PROPERTIES
+                         ENVIRONMENT "FLAGS_use_system_allocator=false"
                          ENVIRONMENT "FLAGS_enable_stream_safe_cuda_allocator=true"
                          ENVIRONMENT "FLAGS_allocator_strategy=auto_growth")
   endif()
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 8e3aa349e72bce..f9f5ee3d29be5f 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -239,14 +239,6 @@ class AllocatorFacadePrivate {
     VLOG(6) << "GetAllocator"
             << " " << place << " " << size;
-#ifdef PADDLE_WITH_CUDA
-    if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-        size > 0) {
-      return GetCUDAAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
-                              default_stream_);
-    }
-#endif
-
     const auto& allocators =
         (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_
                                                           : GetAllocatorMap())
@@ -259,7 +251,7 @@ class AllocatorFacadePrivate {
   }

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  const std::shared_ptr<Allocator>& GetCUDAAllocator(
+  const std::shared_ptr<Allocator>& GetAllocator(
       const platform::CUDAPlace& place, const gpuStream_t& stream) {
     auto place_it = cuda_allocators_.find(place);
     PADDLE_ENFORCE_NE(place_it, cuda_allocators_.end(),
@@ -292,18 +284,18 @@ class AllocatorFacadePrivate {
                          const gpuStream_t& stream, size_t size) {
     std::shared_ptr<Allocator> cuda_allocator;
     try {
-      cuda_allocator = GetCUDAAllocator(place, stream);
+      cuda_allocator = GetAllocator(place, stream);
     } catch (platform::EnforceNotMet& err) {
       VLOG(9) << "No allocator found for stream " << stream << " in place "
               << place << " , build a new one";
       std::unique_lock<std::mutex> lock(cuda_init_mutex_);
       try {
-        cuda_allocator = GetCUDAAllocator(place, stream);
+        cuda_allocator = GetAllocator(place, stream);
         VLOG(9) << "Other thread had build a new allocator for stream "
                 << stream << " in place " << place;
       } catch (platform::EnforceNotMet& err) {
         InitStreamSafeCUDAAllocator(place, stream);
-        cuda_allocator = GetCUDAAllocator(place, stream);
+        cuda_allocator = GetAllocator(place, stream);
       } catch (...) {
         throw;
       }
@@ -595,7 +587,7 @@ class AllocatorFacadePrivate {
   void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
     const std::shared_ptr<Allocator>& underlying_allocator =
-        GetCUDAAllocator(p, stream);
+        GetAllocator(p, stream);
     cuda_allocators_[p][stream] =
         std::make_shared<StreamSafeCUDAAllocator>(underlying_allocator, stream);
   }
@@ -749,7 +741,8 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                      size_t size) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place)) {
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      size > 0 && FLAGS_use_system_allocator == false) {
     return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place),
                  m_->GetDefaultStream(), size);
   }
@@ -759,7 +752,8 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
 uint64_t AllocatorFacade::Release(const platform::Place& place) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place)) {
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      FLAGS_use_system_allocator == false) {
     return Release(BOOST_GET_CONST(platform::CUDAPlace, place),
                    m_->GetDefaultStream());
   }
@@ -770,6 +764,13 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      FLAGS_use_system_allocator == false) {
+    return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
+                            m_->GetDefaultStream());
+  }
+#endif
   return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
 }

@@ -795,7 +796,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
           "multi-stream 'Alloc' function. "
" "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); - if (size > 0) { + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { return m_->CUDAAlloc(place, stream, size); } else { return m_->GetAllocator(place, size)->Allocate(size); @@ -811,7 +812,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, "multi-stream 'Release' function. " "To enable it, you can enter 'export " "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal.")); - return m_->GetCUDAAllocator(place, stream)->Release(place); + return m_->GetAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(Allocation* allocation, From b78b055b821e67f64b869869c0529b44bc3231af Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Mon, 22 Nov 2021 17:54:26 +0800 Subject: [PATCH 11/13] Performance improvement: directly delete allocation when the recorded_streams is empty in FreeImpl of StreamSafeCUDAAllocator --- .../memory/allocation/allocator_facade.cc | 28 +++++++++---------- .../allocation/stream_safe_cuda_allocator.cc | 8 +++++- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index f9f5ee3d29be5f..4447dacd8baff6 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -68,10 +68,9 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, "Use VirtualMemoryAutoGrowthBestFitAllocator."); -// NOTE(Ruibiao): This FLAGS is just to be compatibled with the old -// single-stream -// CUDA allocator. It will be removed after StreamSafeCudaAllocator has been -// fully tested. +// NOTE(Ruibiao): This FLAGS is just to be compatibled with +// the old single-stream CUDA allocator. It will be removed +// after StreamSafeCudaAllocator has been fully tested. PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, true, "Enable StreamSafeCUDAAllocator"); @@ -234,11 +233,10 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); } - const std::shared_ptr& GetAllocator(const platform::Place& place, - size_t size) { + inline const std::shared_ptr& GetAllocator( + const platform::Place& place, size_t size) { VLOG(6) << "GetAllocator" << " " << place << " " << size; - const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? 
system_allocators_ : GetAllocatorMap()) @@ -575,23 +573,23 @@ class AllocatorFacadePrivate { allocators_[p] = std::make_shared(p); } + void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { + const std::shared_ptr& underlying_allocator = + GetAllocator(p, stream); + cuda_allocators_[p][stream] = + std::make_shared(underlying_allocator, stream); + } + void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, size_t retry_time) { PADDLE_ENFORCE_GT( retry_time, 0, platform::errors::InvalidArgument( "Retry time should be larger than 0, but got %d", retry_time)); - std::shared_ptr& allocator = cuda_allocators_[p][stream]; + std::shared_ptr allocator = GetAllocator(p, stream); allocator = std::make_shared(allocator, retry_time); } - void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { - const std::shared_ptr& underlying_allocator = - GetAllocator(p, stream); - cuda_allocators_[p][stream] = - std::make_shared(underlying_allocator, stream); - } - static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { for (auto& place_pair : allocators) { for (auto& stream_pair : place_pair.second) { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 5996786995656d..26ad68468aa505 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -59,7 +59,13 @@ Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { std::lock_guard lock_guard(spin_lock_); - FreeStreamSafeCUDAAllocation(allocation); + if (dynamic_cast(allocation) + ->GetRecordedStreams() + ->empty()) { + delete allocation; + } else { + FreeStreamSafeCUDAAllocation(allocation); + } } uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { From 896a67a3485d9f05f64dcf4bda70554c426f576d Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Tue, 23 Nov 2021 14:26:38 +0800 Subject: [PATCH 12/13] Add UT for alloc interface --- .../memory/stream_safe_cuda_alloc_test.cu | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index a75ef88141d6ca..e63bc789064276 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -160,17 +160,35 @@ class StreamSafeCUDAAllocTest : public ::testing::Test { std::vector> allocations_; }; -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStream) { +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { MultiStreamRun(); CheckResult(); } -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStream) { +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { MultiThreadMUltiStreamRun(); CheckResult(); } -TEST(StreamSafeCUDAAllocRetryTest, StreamSafeCUDAAllocRetry) { +TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { + platform::CUDAPlace place = platform::CUDAPlace(); + size_t alloc_size = 256; + + std::shared_ptr allocation_implicit_stream = + AllocShared(place, alloc_size); + EXPECT_GE(allocation_implicit_stream->size(), alloc_size); + + void *address = allocation_implicit_stream->ptr(); + allocation_implicit_stream.reset(); + + gpuStream_t default_stream = nullptr; + allocation::AllocationPtr allocation_unique = + Alloc(place, default_stream, alloc_size); + 
EXPECT_GE(allocation_unique->size(), alloc_size); + EXPECT_EQ(allocation_unique->ptr(), address); +} + +TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; #ifdef PADDLE_WITH_CUDA From 5a300a3ac34dc40aa75cf8a00bb4a1440b3305fc Mon Sep 17 00:00:00 2001 From: chenruibiao Date: Tue, 23 Nov 2021 19:52:49 +0800 Subject: [PATCH 13/13] Changes multi-stream interface; move retry code from AllocatorFacadePrivate to StreamSafeCUDAAllocator --- .../memory/allocation/allocator_facade.cc | 120 +++++++----------- .../memory/allocation/allocator_facade.h | 12 +- .../allocation/stream_safe_cuda_allocator.cc | 58 +++++++-- .../allocation/stream_safe_cuda_allocator.h | 12 +- paddle/fluid/memory/malloc.cc | 14 +- paddle/fluid/memory/malloc.h | 8 +- .../memory/stream_safe_cuda_alloc_test.cu | 8 +- 7 files changed, 125 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 4447dacd8baff6..a2b4cdddd30555 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -250,7 +250,8 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const std::shared_ptr& GetAllocator( - const platform::CUDAPlace& place, const gpuStream_t& stream) { + const platform::CUDAPlace& place, const gpuStream_t& stream, + bool create_if_not_found = false) { auto place_it = cuda_allocators_.find(place); PADDLE_ENFORCE_NE(place_it, cuda_allocators_.end(), platform::errors::NotFound( @@ -259,11 +260,15 @@ class AllocatorFacadePrivate { const std::map>& allocator_map = place_it->second; auto stream_it = allocator_map.find(stream); - PADDLE_ENFORCE_NE( - stream_it, allocator_map.end(), - platform::errors::NotFound( + if (stream_it == allocator_map.end()) { + if (create_if_not_found) { + InitStreamSafeCUDAAllocator(place, stream); + return cuda_allocators_[place][stream]; + } else { + PADDLE_THROW(platform::errors::NotFound( "No allocator found for stream %s in place %s", stream, place)); - + } + } return stream_it->second; } @@ -278,49 +283,6 @@ class AllocatorFacadePrivate { dynamic_cast(allocation)->RecordStream(stream); } - AllocationPtr CUDAAlloc(const platform::CUDAPlace& place, - const gpuStream_t& stream, size_t size) { - std::shared_ptr cuda_allocator; - try { - cuda_allocator = GetAllocator(place, stream); - } catch (platform::EnforceNotMet& err) { - VLOG(9) << "No allocator found for stream " << stream << " in place " - << place << " , build a new one"; - std::unique_lock lock(cuda_init_mutex_); - try { - cuda_allocator = GetAllocator(place, stream); - VLOG(9) << "Other thread had build a new allocator for stream " - << stream << " in place " << place; - } catch (platform::EnforceNotMet& err) { - InitStreamSafeCUDAAllocator(place, stream); - cuda_allocator = GetAllocator(place, stream); - } catch (...) { - throw; - } - } catch (...) { - throw; - } - - try { - return cuda_allocator->Allocate(size); - } catch (BadAlloc&) { - VLOG(9) << "Allocation failed when allocating " << size - << " bytes for stream " << stream; - for (auto pair : cuda_allocators_[place]) { - pair.second->Release(place); - } - try { - return cuda_allocator->Allocate(size); - } catch (...) { - VLOG(9) << "Still allocation failed " - << "after release memory from all streams"; - throw; - } - } catch (...) 
{
-      throw;
-    }
-  }
-
-    try {
-      return cuda_allocator->Allocate(size);
-    } catch (BadAlloc&) {
-      VLOG(9) << "Allocation failed when allocating " << size
-              << " bytes for stream " << stream;
-      for (auto pair : cuda_allocators_[place]) {
-        pair.second->Release(place);
-      }
-      try {
-        return cuda_allocator->Allocate(size);
-      } catch (...) {
-        VLOG(9) << "Still allocation failed "
-                << "after release memory from all streams";
-        throw;
-      }
-    } catch (...) {
-      throw;
-    }
-  }
-
 #ifdef PADDLE_WITH_CUDA
   void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
     PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth,
@@ -404,15 +366,22 @@ class AllocatorFacadePrivate {
     PADDLE_ENFORCE_EQ(
         strategy_, AllocatorStrategy::kAutoGrowth,
         platform::errors::Unimplemented(
-            "Now only support auto-growth strategey for "
-            "StreamSafeCUDAAllocator, "
+            "Only support auto-growth strategy for StreamSafeCUDAAllocator, "
            "the allocator strategy %d is unsupported for multi-stream",
            static_cast<int>(strategy_)));
     VLOG(9) << "Init CUDA allocator for stream " << stream << " in place " << p;
-
-    InitAutoGrowthCUDAAllocator(p, stream);
-    WrapStreamSafeCUDAAllocator(p, stream);
-    WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
+    std::lock_guard<SpinLock> lock_guard(cuda_allocators_lock_);
+    try {
+      GetAllocator(p, stream);
+      VLOG(9) << "Other thread has built an allocator for stream " << stream
+              << " in place " << p;
+    } catch (platform::EnforceNotMet&) {
+      InitAutoGrowthCUDAAllocator(p, stream);
+      WrapStreamSafeCUDAAllocator(p, stream);
+      WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time);
+    } catch (...) {
+      throw;
+    }
   }

   void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
@@ -575,8 +545,8 @@ class AllocatorFacadePrivate {
   void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) {
     const std::shared_ptr<Allocator>& underlying_allocator =
         GetAllocator(p, stream);
-    cuda_allocators_[p][stream] =
-        std::make_shared<StreamSafeCUDAAllocator>(underlying_allocator, stream);
+    cuda_allocators_[p][stream] = std::make_shared<StreamSafeCUDAAllocator>(
+        underlying_allocator, p, stream);
   }

   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
@@ -704,7 +673,7 @@ class AllocatorFacadePrivate {
   // a standalone CUDA allocator to support multi-stream GC in new executor
   CUDAAllocatorMap cuda_allocators_;
   gpuStream_t default_stream_;
-  std::mutex cuda_init_mutex_;
+  SpinLock cuda_allocators_lock_;
 #ifdef PADDLE_WITH_CUDA
   std::unordered_map<CUDAGraphID, std::unique_ptr<AllocatorFacadePrivate>>
       cuda_graph_allocator_map_;
@@ -731,6 +700,18 @@ AllocatorFacade& AllocatorFacade::Instance() {
   return instance;
 }

+const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
+    const platform::Place& place) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      FLAGS_use_system_allocator == false) {
+    return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
+                            m_->GetDefaultStream());
+  }
+#endif
+  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+}
+
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size) {
   return std::shared_ptr<Allocation>(Alloc(place, size));
@@ -741,8 +722,8 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
       size > 0 && FLAGS_use_system_allocator == false) {
-    return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place),
-                 m_->GetDefaultStream(), size);
+    return Alloc(BOOST_GET_CONST(platform::CUDAPlace, place), size,
+                 m_->GetDefaultStream());
   }
 #endif
   return m_->GetAllocator(place, size)->Allocate(size);
@@ -760,21 +741,9 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
       ->Release(place);
 }

-const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
-    const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-
-      FLAGS_use_system_allocator == false) {
-    return m_->GetAllocator(BOOST_GET_CONST(platform::CUDAPlace, place),
-                            m_->GetDefaultStream());
-  }
-#endif
-  return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
-}
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
-    const platform::CUDAPlace& place, const gpuStream_t& stream, size_t size) {
+    const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
          "StreamSafeCUDAAllocator is disabled, you should not call this "
          "multi-stream 'AllocShared' function. "
          "To enable it, you can enter 'export "
          "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
-  return std::shared_ptr<Allocation>(Alloc(place, stream, size));
+  return std::shared_ptr<Allocation>(Alloc(place, size, stream));
 }

 AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
-                                     const gpuStream_t& stream, size_t size) {
+                                     size_t size, const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
          "StreamSafeCUDAAllocator is disabled, you should not call this "
          "multi-stream 'Alloc' function. "
          "To enable it, you can enter 'export "
          "FLAGS_use_stream_safe_cuda_allocator=true' in the terminal."));
   if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
-    return m_->CUDAAlloc(place, stream, size);
+    return m_->GetAllocator(place, stream, /* create_if_not_found = */ true)
+        ->Allocate(size);
   } else {
     return m_->GetAllocator(place, size)->Allocate(size);
   }
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index 813cf62c1366c9..4cd8b4e91e614e 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -47,6 +47,8 @@ class AllocatorFacade {

   static AllocatorFacade& Instance();

+  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);
+
   // Allocate a shared allocation.
   std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                           size_t size);
@@ -55,14 +57,12 @@ class AllocatorFacade {
   // Release unused memory pool.
   uint64_t Release(const platform::Place& place);

-  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place);
-
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
-                                          const gpuStream_t& stream,
-                                          size_t size);
-  AllocationPtr Alloc(const platform::CUDAPlace& place,
-                      const gpuStream_t& stream, size_t size);
+  std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
+                                          size_t size,
+                                          const gpuStream_t& stream);
+  AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                      const gpuStream_t& stream);
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(Allocation* allocation, const gpuStream_t& stream);
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index 26ad68468aa505..b2e13af6ef956e 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
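// ---- Editor's sketch (illustrative, not part of the patch) ----
// The SpinLock type used throughout the allocator below comes from
// spin_lock.h (introduced in patch 09); its implementation is not shown in
// this series. A minimal equivalent that satisfies BasicLockable, and would
// therefore work with the std::lock_guard<SpinLock> guards in this file, is:
//
//   #include <atomic>
//   class SpinLock {
//    public:
//     void lock() {
//       while (flag_.test_and_set(std::memory_order_acquire)) {
//         // busy-wait; acceptable because the critical sections are short
//       }
//     }
//     void unlock() { flag_.clear(std::memory_order_release); }
//    private:
//     std::atomic_flag flag_ = ATOMIC_FLAG_INIT;
//   };
// ----------------------------------------------------------------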
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace memory { @@ -42,38 +43,67 @@ StreamSafeCUDAAllocation::GetRecordedStreams() { StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( const std::shared_ptr& underlying_allocator, - const gpuStream_t default_stream) + const platform::CUDAPlace& place, const gpuStream_t default_stream) : underlying_allocator_(underlying_allocator), - default_stream_(default_stream) {} + place_(place), + default_stream_(default_stream) { + std::lock_guard lock_guard(allocators_map_lock_); + allocators_map_[place].emplace_back(this); +} + +StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { + std::lock_guard lock_guard(allocators_map_lock_); + std::vector& allocators = allocators_map_[place_]; + allocators.erase(std::remove(allocators.begin(), allocators.end(), this), + allocators.end()); +} bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { - std::lock_guard lock_guard(spin_lock_); ProcessEventsAndFree(); - AllocationPtr underlying_allocation = underlying_allocator_->Allocate(size); + AllocationPtr underlying_allocation; + try { + underlying_allocation = underlying_allocator_->Allocate(size); + } catch (BadAlloc&) { + VLOG(9) << "Allocation failed when allocating " << size << " bytes"; + uint64_t release_size = ReleaseImpl(place_); + VLOG(9) << "Release " << release_size << " bytes memory from all streams"; + try { + underlying_allocation = underlying_allocator_->Allocate(size); + } catch (...) { + VLOG(9) << "Still allocation failed after release memory"; + throw; + } + } catch (...) { + throw; + } + StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( std::move(underlying_allocation), default_stream_); return allocation; } void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { - std::lock_guard lock_guard(spin_lock_); if (dynamic_cast(allocation) ->GetRecordedStreams() ->empty()) { delete allocation; } else { + std::lock_guard lock_guard(outstanding_events_map_lock_); FreeStreamSafeCUDAAllocation(allocation); } } uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { - /*lock_guard*/ { - std::lock_guard lock_guard(spin_lock_); - ProcessEventsAndFree(); + std::lock_guard lock_guard(allocators_map_lock_); + std::vector& allocators = + allocators_map_[BOOST_GET_CONST(platform::CUDAPlace, place)]; + uint64_t release_size = 0; + for (StreamSafeCUDAAllocator* allocator : allocators) { + release_size += allocator->ProcessEventsAndFreeWithRelease(); } - return underlying_allocator_->Release(place); + return release_size; } void StreamSafeCUDAAllocator::CreateEventForAllRecordedStream( @@ -116,6 +146,7 @@ void StreamSafeCUDAAllocator::FreeStreamSafeCUDAAllocation( } void StreamSafeCUDAAllocator::ProcessEventsAndFree() { + std::lock_guard lock_guard(outstanding_events_map_lock_); for (auto map_it = outstanding_events_map_.begin(); map_it != outstanding_events_map_.end();) { std::deque& outstanding_events = map_it->second; @@ -161,6 +192,15 @@ void StreamSafeCUDAAllocator::ProcessEventsAndFree() { } } +uint64_t StreamSafeCUDAAllocator::ProcessEventsAndFreeWithRelease() { + ProcessEventsAndFree(); + return underlying_allocator_->Release(place_); +} + +std::map> + StreamSafeCUDAAllocator::allocators_map_; +SpinLock StreamSafeCUDAAllocator::allocators_map_lock_; + } // namespace allocation } // namespace memory } // namespace paddle diff --git 
a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 93da1fd664f018..a516558228be63 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -26,6 +26,7 @@ #include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/spin_lock.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { @@ -49,7 +50,8 @@ class StreamSafeCUDAAllocator : public Allocator { public: StreamSafeCUDAAllocator( const std::shared_ptr &underlying_allocator, - const gpuStream_t default_stream); + const platform::CUDAPlace &place, const gpuStream_t default_stream); + ~StreamSafeCUDAAllocator(); bool IsAllocThreadSafe() const override; protected: @@ -63,11 +65,17 @@ class StreamSafeCUDAAllocator : public Allocator { std::deque *outstanding_events); void FreeStreamSafeCUDAAllocation(Allocation *allocation); void ProcessEventsAndFree(); + uint64_t ProcessEventsAndFreeWithRelease(); + + static std::map> + allocators_map_; + static SpinLock allocators_map_lock_; std::shared_ptr underlying_allocator_; + platform::CUDAPlace place_; gpuStream_t default_stream_; std::map> outstanding_events_map_; - SpinLock spin_lock_; + SpinLock outstanding_events_map_lock_; }; } // namespace allocation diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 9cd391acca6a45..4921b87ccd99e9 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -35,15 +35,15 @@ uint64_t Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::shared_ptr AllocShared(const platform::CUDAPlace& place, - const gpuStream_t& stream, - size_t size) { - return allocation::AllocatorFacade::Instance().AllocShared(place, stream, - size); + size_t size, + const gpuStream_t& stream) { + return allocation::AllocatorFacade::Instance().AllocShared(place, size, + stream); } -AllocationPtr Alloc(const platform::CUDAPlace& place, const gpuStream_t& stream, - size_t size) { - return allocation::AllocatorFacade::Instance().Alloc(place, stream, size); +AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const gpuStream_t& stream) { + return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); } uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 4926dc572cad4c..2aa9fbe6ada8fe 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -42,11 +42,11 @@ extern uint64_t Release(const platform::Place& place); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) extern std::shared_ptr AllocShared(const platform::CUDAPlace& place, - const gpuStream_t& stream, - size_t size); + size_t size, + const gpuStream_t& stream); -extern AllocationPtr Alloc(const platform::CUDAPlace& place, - const gpuStream_t& stream, size_t size); +extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const gpuStream_t& stream); extern uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index e63bc789064276..6a5818fd9603be 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -63,7 +63,7 @@ class 
StreamSafeCUDAAllocTest : public ::testing::Test { for (size_t i = 0; i < stream_num_; ++i) { size_t allocation_size = data_num_ * sizeof(int); std::shared_ptr allocation = - AllocShared(place_, streams_[i], allocation_size); + AllocShared(place_, allocation_size, streams_[i]); #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemset(allocation->ptr(), 0, allocation->size())); @@ -183,7 +183,7 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { gpuStream_t default_stream = nullptr; allocation::AllocationPtr allocation_unique = - Alloc(place, default_stream, alloc_size); + Alloc(place, alloc_size, default_stream); EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_EQ(allocation_unique->ptr(), address); } @@ -203,12 +203,12 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { size_t alloc_size = available_size / 4 * 3; std::shared_ptr allocation1 = - AllocShared(place, stream1, alloc_size); + AllocShared(place, alloc_size, stream1); std::shared_ptr allocation2; std::thread th([&allocation2, &place, &stream2, alloc_size]() { std::this_thread::sleep_for(std::chrono::seconds(1)); - allocation2 = AllocShared(place, stream2, alloc_size); + allocation2 = AllocShared(place, alloc_size, stream2); }); allocation1.reset(); // free but not release th.join();
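// ---- Editor's note (illustrative, not part of the patch) ----
// Why the retry test above passes deterministically: alloc_size is set to
// available_size * 3 / 4, so alloc_size < available_size < 2 * alloc_size
// and the two allocations can never coexist (e.g. with 8 GiB free, each
// request is 6 GiB). After allocation1.reset() the chunk is freed but still
// cached in stream1's auto_growth pool, so the second thread's AllocShared
// on stream2 first hits BadAlloc; the allocator then releases idle memory
// from the other stream's pool and, failing that, the retry allocator keeps
// re-attempting for up to FLAGS_gpu_allocator_retry_time ms until the
// request can be satisfied.
// ----------------------------------------------------------------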