
Commit 4ccfd21

[SYCL][CUDA] Don't restore CUDA contexts (#4442)
This patch fixes #4171. The issue highlighted in that ticket is that CUDA contexts are bound to threads, and PI calls are executed both on the main thread and on threads of the thread pool. To ensure the active context is correct, the CUDA plugin uses the `ScopedContext` RAII struct to set the active context to the PI context and restore the previously active context at the end of the PI call. However, as an optimization, `ScopedContext` skips the context recovery if there was originally no active context on the thread, which means it leaves the PI context active on the thread. In addition, deleting a CUDA context only deactivates it on the thread where it is deleted; it stays active on other threads after being deleted.

As a result, if you start from an application with no CUDA context active, create a SYCL queue, run an operation, and then delete the SYCL queue, the context on the current thread will be created, set active, deactivated, and deleted properly. It will not be deactivated on the threads of the thread pool, however, so if we create another queue and run SYCL operations on the thread pool again, that second queue will set up its own context in those threads but then try to restore the deleted context from the previous queue.

This patch fixes the issue by simply never restoring the previous active context: PI calls from the second queue running in the thread pool just override the deleted context and never try to restore it. This should work well in SYCL-only code, as all PI calls are guarded by the `ScopedContext` and change the active context accordingly. It may even improve performance in certain multi-context scenarios, because the current implementation only prevents context switches for the first context used, while this change prevents context switches for the most recently used context instead.

In CUDA interop scenarios, however, it does mean that after running any SYCL code, CUDA interop code cannot make assumptions about the active context and needs to reset it to whatever context it requires (see the sketch below). As far as I'm aware, this is already the current practice in `oneMKL` and `oneDNN`, which also use a `ScopedContext` mechanism.

In summary, this patch should:

* Fix trying to restore deleted contexts in internal SYCL threads
* Possibly improve performance in certain multi-context scenarios
* Break assumptions on the active context for CUDA interop code
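To illustrate the interop implication, here is a minimal sketch of interop code that follows the new rule. The `interop_work` helper and the `myCtx`/`buf` parameters are hypothetical names for illustration, not part of this patch:

```cpp
#include <cuda.h>

// Hypothetical interop helper: after any SYCL code has run, the active
// context on this thread may be whichever context the SYCL runtime used
// last, so interop code must explicitly activate the context it needs
// instead of assuming SYCL restored it.
void interop_work(CUcontext myCtx, CUdeviceptr buf, size_t bytes) {
  cuCtxSetCurrent(myCtx);    // never assume myCtx is still current
  cuMemsetD8(buf, 0, bytes); // driver API calls now run against myCtx
}
```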
1 parent 9827dc4 commit 4ccfd21

3 files changed (+281 −32)

sycl/plugins/cuda/pi_cuda.cpp (+33 −32)
```diff
@@ -137,47 +137,48 @@ pi_result check_error(CUresult result, const char *function, int line,
 /// \cond NODOXY
 #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__)
 
-/// RAII type to guarantee recovering original CUDA context
-/// Scoped context is used across all PI CUDA plugin implementation
-/// to activate the PI Context on the current thread, matching the
-/// CUDA driver semantics where the context used for the CUDA Driver
-/// API is the one active on the thread.
-/// The implementation tries to avoid replacing the CUcontext if it cans
+/// ScopedContext is used across all PI CUDA plugin implementation to ensure
+/// that the proper CUDA context is active for the given PI context.
+//
+/// This class will only replace the context if necessary, and will leave the
+/// new context active on the current thread. If there was an active context
+/// already it will simply be replaced.
+//
+/// Previously active contexts are not restored for two reasons:
+/// * Performance: context switches are expensive so leaving the context active
+///   means subsequent SYCL calls with the same context will be cheaper.
+/// * Multi-threading cleanup: contexts are set active per thread and deleting a
+///   context will only deactivate it for the current thread. This means other
+///   threads may end up with deleted active contexts. In particular this can
+///   happen with host_tasks as they run in a thread pool. When the context
+///   associated with these tasks is deleted it will remain active in the
+///   threads of the thread pool. So it would be invalid for any other task
+///   running on these threads to try to restore the deleted context. With the
+///   current implementation this is not an issue because the active deleted
+///   context will just be replaced.
+//
+/// This approach does mean that CUDA interop tasks should NOT expect their
+/// contexts to be restored by SYCL.
 class ScopedContext {
-  pi_context placedContext_;
-  CUcontext original_;
-  bool needToRecover_;
-
 public:
-  ScopedContext(pi_context ctxt) : placedContext_{ctxt}, needToRecover_{false} {
-
-    if (!placedContext_) {
+  ScopedContext(pi_context ctxt) {
+    if (!ctxt) {
       throw PI_INVALID_CONTEXT;
     }
 
-    CUcontext desired = placedContext_->get();
-    PI_CHECK_ERROR(cuCtxGetCurrent(&original_));
-    if (original_ != desired) {
-      // Sets the desired context as the active one for the thread
+    CUcontext desired = ctxt->get();
+    CUcontext original = nullptr;
+
+    PI_CHECK_ERROR(cuCtxGetCurrent(&original));
+
+    // Make sure the desired context is active on the current thread, setting
+    // it if necessary
+    if (original != desired) {
       PI_CHECK_ERROR(cuCtxSetCurrent(desired));
-      if (original_ == nullptr) {
-        // No context is installed on the current thread
-        // This is the most common case. We can activate the context in the
-        // thread and leave it there until all the PI context referring to the
-        // same underlying CUDA context are destroyed. This emulates
-        // the behaviour of the CUDA runtime api, and avoids costly context
-        // switches. No action is required on this side of the if.
-      } else {
-        needToRecover_ = true;
-      }
     }
   }
 
-  ~ScopedContext() {
-    if (needToRecover_) {
-      PI_CHECK_ERROR(cuCtxSetCurrent(original_));
-    }
-  }
+  ~ScopedContext() {}
 };
 
 /// \cond NODOXY
```
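For reference, here is a minimal sketch of how PI entry points in the plugin use this RAII type; the `cuda_piExampleCall` function is a hypothetical illustration of the pattern, not code from this diff:

```cpp
// Hypothetical PI entry point: ScopedContext makes the PI context's
// CUcontext current for the duration of the call and, after this patch,
// simply leaves it active when the call returns.
pi_result cuda_piExampleCall(pi_context context) {
  ScopedContext active(context); // activates context->get() if necessary
  // ... CUDA driver API calls here target the correct context ...
  return PI_SUCCESS;
}
```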

sycl/unittests/pi/cuda/CMakeLists.txt (+1 −0)

```diff
@@ -2,6 +2,7 @@ set(LLVM_REQUIRES_EH 1)
 add_sycl_unittest(PiCudaTests OBJECT
   test_base_objects.cpp
   test_commands.cpp
+  test_contexts.cpp
   test_device.cpp
   test_interop_get_native.cpp
   test_kernels.cpp
```
sycl/unittests/pi/cuda/test_contexts.cpp (new file, +247 −0)

```cpp
//==---- test_contexts.cpp --- PI unit tests -------------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "gtest/gtest.h"

#include <condition_variable>
#include <thread>
#include <mutex>

#include <cuda.h>

#include "TestGetPlugin.hpp"
#include <CL/sycl.hpp>
#include <CL/sycl/detail/pi.hpp>
#include <detail/plugin.hpp>
#include <pi_cuda.hpp>

using namespace cl::sycl;

struct CudaContextsTest : public ::testing::Test {

protected:
  detail::plugin *plugin = pi::initializeAndGet(backend::cuda);

  pi_platform platform_;
  pi_device device_;

  void SetUp() override {
    // skip the tests if the CUDA backend is not available
    if (!plugin) {
      GTEST_SKIP();
    }

    pi_uint32 numPlatforms = 0;
    ASSERT_EQ(plugin->getBackend(), backend::cuda);

    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
                  0, nullptr, &numPlatforms)),
              PI_SUCCESS)
        << "piPlatformsGet failed.\n";

    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
                  numPlatforms, &platform_, nullptr)),
              PI_SUCCESS)
        << "piPlatformsGet failed.\n";

    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
              PI_SUCCESS);
  }

  void TearDown() override {}

  CudaContextsTest() = default;

  ~CudaContextsTest() = default;
};

TEST_F(CudaContextsTest, ContextLifetime) {
  // start with no active context
  cuCtxSetCurrent(nullptr);

  // create a context
  pi_context context;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context)),
            PI_SUCCESS);
  ASSERT_NE(context, nullptr);

  // create a queue from the context, this should use the ScopedContext
  pi_queue queue;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                context, device_, 0, &queue)),
            PI_SUCCESS);
  ASSERT_NE(queue, nullptr);

  // ensure the queue has the correct context
  ASSERT_EQ(context, queue->get_context());

  // check that the context is now the active CUDA context
  CUcontext cudaCtxt = nullptr;
  cuCtxGetCurrent(&cudaCtxt);
  ASSERT_EQ(cudaCtxt, context->get());

  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
  plugin->call<detail::PiApiKind::piContextRelease>(context);

  // check that the context was cleaned up properly by the destructor
  cuCtxGetCurrent(&cudaCtxt);
  ASSERT_EQ(cudaCtxt, nullptr);
}

TEST_F(CudaContextsTest, ContextLifetimeExisting) {
  // start by setting up a CUDA context on the thread
  CUcontext original;
  cuCtxCreate(&original, CU_CTX_MAP_HOST, device_->get());

  // ensure the CUDA context is active
  CUcontext current = nullptr;
  cuCtxGetCurrent(&current);
  ASSERT_EQ(original, current);

  // create a PI context
  pi_context context;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context)),
            PI_SUCCESS);
  ASSERT_NE(context, nullptr);

  // create a queue from the context, this should use the ScopedContext
  pi_queue queue;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                context, device_, 0, &queue)),
            PI_SUCCESS);
  ASSERT_NE(queue, nullptr);

  // ensure the queue has the correct context
  ASSERT_EQ(context, queue->get_context());

  // check that the context is now the active CUDA context
  cuCtxGetCurrent(&current);
  ASSERT_EQ(current, context->get());

  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
  plugin->call<detail::PiApiKind::piContextRelease>(context);

  // check that the context was cleaned up, the old context will be restored
  // automatically by cuCtxDestroy in piContextRelease, as it was pushed on the
  // stack by cuCtxCreate
  cuCtxGetCurrent(&current);
  ASSERT_EQ(current, original);

  // release original context
  cuCtxDestroy(original);
}

// In some cases (for host_task), the SYCL runtime may call PI API functions
// from threads of the thread pool. This can cause issues because with the CUDA
// plugin these functions will set an active CUDA context on these threads, but
// never clean it up, as it will only get cleaned up in the main thread.
//
// So the following test aims to reproduce the scenario where there is a
// dangling deleted context in a separate thread, and checks that PI calls are
// still able to work correctly in that thread.
TEST_F(CudaContextsTest, ContextThread) {
  // start with no active context
  cuCtxSetCurrent(nullptr);

  // create two PI contexts
  pi_context context1;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context1)),
            PI_SUCCESS);
  ASSERT_NE(context1, nullptr);

  pi_context context2;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context2)),
            PI_SUCCESS);
  ASSERT_NE(context2, nullptr);

  // set up synchronization variables between the main thread and the testing
  // thread
  std::mutex m;
  std::condition_variable cv;
  bool released = false;
  bool thread_done = false;

  // create a testing thread that will create a queue with the first context,
  // release the queue, then wait for the main thread to release the first
  // context, and then create and release another queue with the second context
  // this time
  auto test_thread = std::thread([&] {
    CUcontext current = nullptr;

    // create a queue with the first context
    pi_queue queue;
    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                  context1, device_, 0, &queue)),
              PI_SUCCESS);
    ASSERT_NE(queue, nullptr);

    // ensure the queue has the correct context
    ASSERT_EQ(context1, queue->get_context());

    // check that the first context is now the active CUDA context
    cuCtxGetCurrent(&current);
    ASSERT_EQ(current, context1->get());

    plugin->call<detail::PiApiKind::piQueueRelease>(queue);

    // mark the first set of processing as done and notify the main thread
    std::unique_lock<std::mutex> lock(m);
    thread_done = true;
    lock.unlock();
    cv.notify_one();

    // wait for the main thread to release the first context
    lock.lock();
    cv.wait(lock, [&] { return released; });

    // check that the first context is still active, this is because deleting a
    // context only cleans up the current thread
    cuCtxGetCurrent(&current);
    ASSERT_EQ(current, context1->get());

    // create a queue with the second context
    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                  context2, device_, 0, &queue)),
              PI_SUCCESS);
    ASSERT_NE(queue, nullptr);

    // ensure the queue has the correct context
    ASSERT_EQ(context2, queue->get_context());

    // check that the second context is now the active CUDA context
    cuCtxGetCurrent(&current);
    ASSERT_EQ(current, context2->get());

    plugin->call<detail::PiApiKind::piQueueRelease>(queue);
  });

  // wait for the thread to be done with the first queue, then release the
  // first context
  std::unique_lock<std::mutex> lock(m);
  cv.wait(lock, [&] { return thread_done; });
  plugin->call<detail::PiApiKind::piContextRelease>(context1);

  // notify the other thread that the context was released
  released = true;
  lock.unlock();
  cv.notify_one();

  // wait for the thread to finish
  test_thread.join();

  plugin->call<detail::PiApiKind::piContextRelease>(context2);

  // check that there is no context set on the main thread
  CUcontext current = nullptr;
  cuCtxGetCurrent(&current);
  ASSERT_EQ(current, nullptr);
}
```
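These tests exercise the PI API directly; at the SYCL level, the scenario they guard against looks roughly like the sketch below. This is a hypothetical reproduction for illustration only, assuming host tasks run on runtime thread-pool threads as described in the commit message; it is not a test from this commit:

```cpp
#include <CL/sycl.hpp>

int main() {
  using namespace cl::sycl;
  {
    queue q1{gpu_selector{}};
    // host_task runs on a runtime thread-pool thread, where the CUDA
    // plugin leaves q1's context active
    q1.submit([&](handler &cgh) { cgh.host_task([] {}); }).wait();
  } // q1 and its context are destroyed here, but the deleted context
    // stays active on the pool threads

  queue q2{gpu_selector{}};
  // before this patch, PI calls made for q2 on those pool threads could
  // try to restore the deleted context; now they simply replace it
  q2.submit([&](handler &cgh) { cgh.host_task([] {}); }).wait();
}
```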
