[SYCL][CUDA] Don't restore CUDA contexts #4442

Merged Sep 3, 2021 (3 commits)
65 changes: 33 additions & 32 deletions sycl/plugins/cuda/pi_cuda.cpp
@@ -137,47 +137,48 @@ pi_result check_error(CUresult result, const char *function, int line,
 /// \cond NODOXY
 #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__)
 
-/// RAII type to guarantee recovering original CUDA context
-/// Scoped context is used across all PI CUDA plugin implementation
-/// to activate the PI Context on the current thread, matching the
-/// CUDA driver semantics where the context used for the CUDA Driver
-/// API is the one active on the thread.
-/// The implementation tries to avoid replacing the CUcontext if it cans
+/// ScopedContext is used across the entire PI CUDA plugin implementation to
+/// ensure that the proper CUDA context is active for the given PI context.
+//
+/// This class will only replace the context if necessary, and it leaves the
+/// new context active on the current thread. If there was already an active
+/// context, it is simply replaced.
+//
+/// Previously active contexts are not restored, for two reasons:
+/// * Performance: context switches are expensive, so leaving the context
+///   active means subsequent SYCL calls with the same context are cheaper.
+/// * Multi-threading cleanup: contexts are set active per thread, and
+///   deleting a context only deactivates it for the current thread. This
+///   means other threads may end up with deleted active contexts. In
+///   particular this can happen with host_tasks, as they run in a thread
+///   pool: when the context associated with these tasks is deleted, it
+///   remains active in the threads of the thread pool, so it would be
+///   invalid for any other task running on those threads to try to restore
+///   the deleted context. With the current implementation this is not an
+///   issue, because the deleted active context is simply replaced.
+//
+/// This approach does mean that CUDA interop tasks should NOT expect their
+/// contexts to be restored by SYCL.
 class ScopedContext {
-  pi_context placedContext_;
-  CUcontext original_;
-  bool needToRecover_;
-
 public:
-  ScopedContext(pi_context ctxt) : placedContext_{ctxt}, needToRecover_{false} {
-
-    if (!placedContext_) {
+  ScopedContext(pi_context ctxt) {
+    if (!ctxt) {
       throw PI_INVALID_CONTEXT;
     }
 
-    CUcontext desired = placedContext_->get();
-    PI_CHECK_ERROR(cuCtxGetCurrent(&original_));
-    if (original_ != desired) {
-      // Sets the desired context as the active one for the thread
+    CUcontext desired = ctxt->get();
+    CUcontext original = nullptr;
+
+    PI_CHECK_ERROR(cuCtxGetCurrent(&original));
+
+    // Make sure the desired context is active on the current thread, setting
+    // it if necessary
+    if (original != desired) {
       PI_CHECK_ERROR(cuCtxSetCurrent(desired));
-      if (original_ == nullptr) {
-        // No context is installed on the current thread
-        // This is the most common case. We can activate the context in the
-        // thread and leave it there until all the PI context referring to the
-        // same underlying CUDA context are destroyed. This emulates
-        // the behaviour of the CUDA runtime api, and avoids costly context
-        // switches. No action is required on this side of the if.
-      } else {
-        needToRecover_ = true;
-      }
     }
   }
 
-  ~ScopedContext() {
-    if (needToRecover_) {
-      PI_CHECK_ERROR(cuCtxSetCurrent(original_));
-    }
-  }
+  ~ScopedContext() {}
 };
 
 /// \cond NODOXY
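Since ScopedContext no longer restores the previous context, CUDA interop code that depends on a specific context being active after a SYCL call must save and restore that context itself. A minimal sketch of such a user-side guard, assuming the application uses the CUDA driver API directly (this helper is illustrative and not part of the PR):

// Hypothetical user-side RAII guard: saves the CUDA context that is current
// at construction and makes it current again at destruction. After this
// change, SYCL will not restore contexts on the application's behalf.
#include <cuda.h>

class CudaContextGuard {
  CUcontext saved_ = nullptr;

public:
  CudaContextGuard() { cuCtxGetCurrent(&saved_); }
  ~CudaContextGuard() { cuCtxSetCurrent(saved_); }

  CudaContextGuard(const CudaContextGuard &) = delete;
  CudaContextGuard &operator=(const CudaContextGuard &) = delete;
};

An application would declare a CudaContextGuard before submitting SYCL work and rely on its destructor to make the original context current again afterwards.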
1 change: 1 addition & 0 deletions sycl/unittests/pi/cuda/CMakeLists.txt
@@ -2,6 +2,7 @@ set(LLVM_REQUIRES_EH 1)
 add_sycl_unittest(PiCudaTests OBJECT
   test_base_objects.cpp
   test_commands.cpp
+  test_contexts.cpp
   test_device.cpp
   test_interop_get_native.cpp
   test_kernels.cpp
247 changes: 247 additions & 0 deletions sycl/unittests/pi/cuda/test_contexts.cpp
@@ -0,0 +1,247 @@
//==---- test_contexts.cpp --- PI unit tests -------------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "gtest/gtest.h"

#include <condition_variable>
#include <mutex>
#include <thread>

#include <cuda.h>

#include "TestGetPlugin.hpp"
#include <CL/sycl.hpp>
#include <CL/sycl/detail/pi.hpp>
#include <detail/plugin.hpp>
#include <pi_cuda.hpp>

using namespace cl::sycl;

struct CudaContextsTest : public ::testing::Test {

protected:
  detail::plugin *plugin = pi::initializeAndGet(backend::cuda);

  pi_platform platform_;
  pi_device device_;

  void SetUp() override {
    // skip the tests if the CUDA backend is not available
    if (!plugin) {
      GTEST_SKIP();
    }

    pi_uint32 numPlatforms = 0;
    ASSERT_EQ(plugin->getBackend(), backend::cuda);

    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
                  0, nullptr, &numPlatforms)),
              PI_SUCCESS)
        << "piPlatformsGet failed.\n";

    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
                  numPlatforms, &platform_, nullptr)),
              PI_SUCCESS)
        << "piPlatformsGet failed.\n";

    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
              PI_SUCCESS);
  }

  void TearDown() override {}

  CudaContextsTest() = default;

  ~CudaContextsTest() = default;
};

TEST_F(CudaContextsTest, ContextLifetime) {
  // start with no active context
  cuCtxSetCurrent(nullptr);

  // create a context
  pi_context context;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context)),
            PI_SUCCESS);
  ASSERT_NE(context, nullptr);

  // create a queue from the context; this should use ScopedContext
  pi_queue queue;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                context, device_, 0, &queue)),
            PI_SUCCESS);
  ASSERT_NE(queue, nullptr);

  // ensure the queue has the correct context
  ASSERT_EQ(context, queue->get_context());

  // check that the context is now the active CUDA context
  CUcontext cudaCtxt = nullptr;
  cuCtxGetCurrent(&cudaCtxt);
  ASSERT_EQ(cudaCtxt, context->get());

  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
  plugin->call<detail::PiApiKind::piContextRelease>(context);

  // check that the context was cleaned up properly by the destructor
  cuCtxGetCurrent(&cudaCtxt);
  ASSERT_EQ(cudaCtxt, nullptr);
}

TEST_F(CudaContextsTest, ContextLifetimeExisting) {
  // start by setting up a CUDA context on the thread
  CUcontext original;
  cuCtxCreate(&original, CU_CTX_MAP_HOST, device_->get());

  // ensure the CUDA context is active
  CUcontext current = nullptr;
  cuCtxGetCurrent(&current);
  ASSERT_EQ(original, current);

  // create a PI context
  pi_context context;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context)),
            PI_SUCCESS);
  ASSERT_NE(context, nullptr);

  // create a queue from the context; this should use ScopedContext
  pi_queue queue;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                context, device_, 0, &queue)),
            PI_SUCCESS);
  ASSERT_NE(queue, nullptr);

  // ensure the queue has the correct context
  ASSERT_EQ(context, queue->get_context());

  // check that the context is now the active CUDA context
  cuCtxGetCurrent(&current);
  ASSERT_EQ(current, context->get());

  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
  plugin->call<detail::PiApiKind::piContextRelease>(context);

  // check that the context was cleaned up: the old context is restored
  // automatically by cuCtxDestroy in piContextRelease, as it was pushed on
  // the stack by cuCtxCreate
  cuCtxGetCurrent(&current);
  ASSERT_EQ(current, original);

  // release the original context
  cuCtxDestroy(original);
}

// In some cases (for host_task), the SYCL runtime may call PI API functions
// from threads of its thread pool. With the CUDA plugin this can cause
// issues, because these functions set an active CUDA context on those
// threads but never clean it up: the context only gets cleaned up on the
// main thread.
//
// The following test reproduces the scenario where a separate thread is left
// with a dangling deleted active context, and checks that PI calls still
// work correctly on that thread.
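// For reference, the user-level pattern that produces this situation looks
// roughly like the following sketch (hypothetical application code, assuming
// the host_task extension; it is not part of this test):
//
//   cl::sycl::queue q{cl::sycl::gpu_selector{}};
//   q.submit([&](cl::sycl::handler &cgh) {
//     cgh.host_task([] {
//       // Runs on a SYCL runtime thread-pool thread; with the CUDA plugin,
//       // PI calls made on this thread leave a CUDA context active on it.
//     });
//   });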
TEST_F(CudaContextsTest, ContextThread) {
  // start with no active context
  cuCtxSetCurrent(nullptr);

  // create two PI contexts
  pi_context context1;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context1)),
            PI_SUCCESS);
  ASSERT_NE(context1, nullptr);

  pi_context context2;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context2)),
            PI_SUCCESS);
  ASSERT_NE(context2, nullptr);

  // set up synchronization variables between the main thread and the testing
  // thread
  std::mutex m;
  std::condition_variable cv;
  bool released = false;
  bool thread_done = false;

  // create a testing thread that will create a queue with the first context,
  // release the queue, wait for the main thread to release the first
  // context, and then create and release another queue, this time with the
  // second context
  auto test_thread = std::thread([&] {
    CUcontext current = nullptr;

    // create a queue with the first context
    pi_queue queue;
    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                  context1, device_, 0, &queue)),
              PI_SUCCESS);
    ASSERT_NE(queue, nullptr);

    // ensure the queue has the correct context
    ASSERT_EQ(context1, queue->get_context());

    // check that the first context is now the active CUDA context
    cuCtxGetCurrent(&current);
    ASSERT_EQ(current, context1->get());

    plugin->call<detail::PiApiKind::piQueueRelease>(queue);

    // mark the first set of processing as done and notify the main thread
    std::unique_lock<std::mutex> lock(m);
    thread_done = true;
    lock.unlock();
    cv.notify_one();

    // wait for the main thread to release the first context
    lock.lock();
    cv.wait(lock, [&] { return released; });

    // check that the first context is still active; this is because deleting
    // a context only deactivates it on the thread that deletes it
    cuCtxGetCurrent(&current);
    ASSERT_EQ(current, context1->get());

    // create a queue with the second context
    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                  context2, device_, 0, &queue)),
              PI_SUCCESS);
    ASSERT_NE(queue, nullptr);

    // ensure the queue has the correct context
    ASSERT_EQ(context2, queue->get_context());

    // check that the second context is now the active CUDA context
    cuCtxGetCurrent(&current);
    ASSERT_EQ(current, context2->get());

    plugin->call<detail::PiApiKind::piQueueRelease>(queue);
  });

  // wait for the thread to be done with the first queue before releasing the
  // first context
  std::unique_lock<std::mutex> lock(m);
  cv.wait(lock, [&] { return thread_done; });
  plugin->call<detail::PiApiKind::piContextRelease>(context1);

  // notify the other thread that the context was released
  released = true;
  lock.unlock();
  cv.notify_one();

  // wait for the thread to finish
  test_thread.join();

  plugin->call<detail::PiApiKind::piContextRelease>(context2);

  // check that there is no context set on the main thread
  CUcontext current = nullptr;
  cuCtxGetCurrent(&current);
  ASSERT_EQ(current, nullptr);
}