[SYCL][CUDA] Don't restore CUDA contexts #4442

Merged Sep 3, 2021 (3 commits)
65 changes: 33 additions & 32 deletions sycl/plugins/cuda/pi_cuda.cpp
@@ -137,47 +137,48 @@ pi_result check_error(CUresult result, const char *function, int line,
 /// \cond NODOXY
 #define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__)
 
-/// RAII type to guarantee recovering original CUDA context
-/// Scoped context is used across all PI CUDA plugin implementation
-/// to activate the PI Context on the current thread, matching the
-/// CUDA driver semantics where the context used for the CUDA Driver
-/// API is the one active on the thread.
-/// The implementation tries to avoid replacing the CUcontext if it cans
+/// ScopedContext is used across the entire PI CUDA plugin implementation to
+/// ensure that the proper CUDA context is active for the given PI context.
+//
+/// This class will only replace the context if necessary, and it leaves the
+/// new context active on the current thread. If there was already an active
+/// context, it is simply replaced.
+//
+/// Previously active contexts are not restored, for two reasons:
+/// * Performance: context switches are expensive, so leaving the context
+///   active means subsequent SYCL calls with the same context are cheaper.
+/// * Multi-threading cleanup: contexts are set active per thread, and
+///   deleting a context only deactivates it for the current thread. This
+///   means other threads may end up with deleted active contexts. In
+///   particular this can happen with host_tasks, as they run in a thread
+///   pool: when the context associated with these tasks is deleted, it
+///   remains active in the threads of the thread pool, so it would be
+///   invalid for any other task running on those threads to try to restore
+///   the deleted context. With the current implementation this is not an
+///   issue, because the deleted active context is simply replaced.
+//
+/// This approach does mean that CUDA interop tasks should NOT expect their
+/// contexts to be restored by SYCL.
 class ScopedContext {
-  pi_context placedContext_;
-  CUcontext original_;
-  bool needToRecover_;
-
 public:
-  ScopedContext(pi_context ctxt) : placedContext_{ctxt}, needToRecover_{false} {
-
-    if (!placedContext_) {
+  ScopedContext(pi_context ctxt) {
+    if (!ctxt) {
       throw PI_INVALID_CONTEXT;
     }
 
-    CUcontext desired = placedContext_->get();
-    PI_CHECK_ERROR(cuCtxGetCurrent(&original_));
-    if (original_ != desired) {
-      // Sets the desired context as the active one for the thread
+    CUcontext desired = ctxt->get();
+    CUcontext original = nullptr;
+
+    PI_CHECK_ERROR(cuCtxGetCurrent(&original));
+
+    // Make sure the desired context is active on the current thread, setting
+    // it if necessary
+    if (original != desired) {
       PI_CHECK_ERROR(cuCtxSetCurrent(desired));
-      if (original_ == nullptr) {
-        // No context is installed on the current thread
-        // This is the most common case. We can activate the context in the
-        // thread and leave it there until all the PI context referring to the
-        // same underlying CUDA context are destroyed. This emulates
-        // the behaviour of the CUDA runtime api, and avoids costly context
-        // switches. No action is required on this side of the if.
-      } else {
-        needToRecover_ = true;
-      }
     }
   }
 
-  ~ScopedContext() {
-    if (needToRecover_) {
-      PI_CHECK_ERROR(cuCtxSetCurrent(original_));
-    }
-  }
+  ~ScopedContext() {}
 };
 
 /// \cond NODOXY
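Since ScopedContext no longer restores the previous context, CUDA interop code that depends on a specific context being active after a SYCL call must save and restore that context itself. A minimal sketch of such a user-side guard, assuming the application uses the CUDA driver API directly (this helper is illustrative and not part of the PR):

// Hypothetical user-side RAII guard: saves the CUDA context that is current
// at construction and makes it current again at destruction. After this
// change, SYCL will not restore contexts on the application's behalf.
#include <cuda.h>

class CudaContextGuard {
  CUcontext saved_ = nullptr;

public:
  CudaContextGuard() { cuCtxGetCurrent(&saved_); }
  ~CudaContextGuard() { cuCtxSetCurrent(saved_); }

  CudaContextGuard(const CudaContextGuard &) = delete;
  CudaContextGuard &operator=(const CudaContextGuard &) = delete;
};

An application would declare a CudaContextGuard before submitting SYCL work and rely on its destructor to make the original context current again afterwards.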
1 change: 1 addition & 0 deletions sycl/unittests/pi/cuda/CMakeLists.txt
@@ -2,6 +2,7 @@ set(LLVM_REQUIRES_EH 1)
 add_sycl_unittest(PiCudaTests OBJECT
   test_base_objects.cpp
   test_commands.cpp
+  test_contexts.cpp
   test_device.cpp
   test_interop_get_native.cpp
   test_kernels.cpp
247 changes: 247 additions & 0 deletions sycl/unittests/pi/cuda/test_contexts.cpp
@@ -0,0 +1,247 @@
//==---- test_contexts.cpp --- PI unit tests -------------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "gtest/gtest.h"

#include <condition_variable>
#include <mutex>
#include <thread>

#include <cuda.h>

#include "TestGetPlugin.hpp"
#include <CL/sycl.hpp>
#include <CL/sycl/detail/pi.hpp>
#include <detail/plugin.hpp>
#include <pi_cuda.hpp>

using namespace cl::sycl;

struct CudaContextsTest : public ::testing::Test {

protected:
  detail::plugin *plugin = pi::initializeAndGet(backend::cuda);

  pi_platform platform_;
  pi_device device_;

  void SetUp() override {
    // skip the tests if the CUDA backend is not available
    if (!plugin) {
      GTEST_SKIP();
    }

    pi_uint32 numPlatforms = 0;
    ASSERT_EQ(plugin->getBackend(), backend::cuda);

    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
                  0, nullptr, &numPlatforms)),
              PI_SUCCESS)
        << "piPlatformsGet failed.\n";

    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piPlatformsGet>(
                  numPlatforms, &platform_, nullptr)),
              PI_SUCCESS)
        << "piPlatformsGet failed.\n";

    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piDevicesGet>(
                  platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)),
              PI_SUCCESS);
  }

  void TearDown() override {}

  CudaContextsTest() = default;

  ~CudaContextsTest() = default;
};

TEST_F(CudaContextsTest, ContextLifetime) {
  // start with no active context
  cuCtxSetCurrent(nullptr);

  // create a context
  pi_context context;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context)),
            PI_SUCCESS);
  ASSERT_NE(context, nullptr);

  // create a queue from the context; this should use ScopedContext
  pi_queue queue;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                context, device_, 0, &queue)),
            PI_SUCCESS);
  ASSERT_NE(queue, nullptr);

  // ensure the queue has the correct context
  ASSERT_EQ(context, queue->get_context());

  // check that the context is now the active CUDA context
  CUcontext cudaCtxt = nullptr;
  cuCtxGetCurrent(&cudaCtxt);
  ASSERT_EQ(cudaCtxt, context->get());

  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
  plugin->call<detail::PiApiKind::piContextRelease>(context);

  // check that the context was cleaned up properly by the destructor
  cuCtxGetCurrent(&cudaCtxt);
  ASSERT_EQ(cudaCtxt, nullptr);
}

TEST_F(CudaContextsTest, ContextLifetimeExisting) {
  // start by setting up a CUDA context on the thread
  CUcontext original;
  cuCtxCreate(&original, CU_CTX_MAP_HOST, device_->get());

  // ensure the CUDA context is active
  CUcontext current = nullptr;
  cuCtxGetCurrent(&current);
  ASSERT_EQ(original, current);

  // create a PI context
  pi_context context;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context)),
            PI_SUCCESS);
  ASSERT_NE(context, nullptr);

  // create a queue from the context; this should use ScopedContext
  pi_queue queue;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                context, device_, 0, &queue)),
            PI_SUCCESS);
  ASSERT_NE(queue, nullptr);

  // ensure the queue has the correct context
  ASSERT_EQ(context, queue->get_context());

  // check that the context is now the active CUDA context
  cuCtxGetCurrent(&current);
  ASSERT_EQ(current, context->get());

  plugin->call<detail::PiApiKind::piQueueRelease>(queue);
  plugin->call<detail::PiApiKind::piContextRelease>(context);

  // check that the context was cleaned up: the old context is restored
  // automatically by cuCtxDestroy in piContextRelease, as it was pushed on
  // the stack by cuCtxCreate
  cuCtxGetCurrent(&current);
  ASSERT_EQ(current, original);

  // release the original context
  cuCtxDestroy(original);
}

// In some cases (for host_task), the SYCL runtime may call PI API functions
// from threads of its thread pool. With the CUDA plugin this can cause
// issues, because these functions set an active CUDA context on those
// threads but never clean it up: the context only gets cleaned up on the
// main thread.
//
// The following test reproduces the scenario where a separate thread is left
// with a dangling deleted active context, and checks that PI calls still
// work correctly on that thread.
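// For reference, the user-level pattern that produces this situation looks
// roughly like the following sketch (hypothetical application code, assuming
// the host_task extension; it is not part of this test):
//
//   cl::sycl::queue q{cl::sycl::gpu_selector{}};
//   q.submit([&](cl::sycl::handler &cgh) {
//     cgh.host_task([] {
//       // Runs on a SYCL runtime thread-pool thread; with the CUDA plugin,
//       // PI calls made on this thread leave a CUDA context active on it.
//     });
//   });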
TEST_F(CudaContextsTest, ContextThread) {
  // start with no active context
  cuCtxSetCurrent(nullptr);

  // create two PI contexts
  pi_context context1;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context1)),
            PI_SUCCESS);
  ASSERT_NE(context1, nullptr);

  pi_context context2;
  ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piContextCreate>(
                nullptr, 1, &device_, nullptr, nullptr, &context2)),
            PI_SUCCESS);
  ASSERT_NE(context2, nullptr);

  // set up synchronization variables between the main thread and the testing
  // thread
  std::mutex m;
  std::condition_variable cv;
  bool released = false;
  bool thread_done = false;

  // create a testing thread that will create a queue with the first context,
  // release the queue, wait for the main thread to release the first
  // context, and then create and release another queue, this time with the
  // second context
  auto test_thread = std::thread([&] {
    CUcontext current = nullptr;

    // create a queue with the first context
    pi_queue queue;
    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                  context1, device_, 0, &queue)),
              PI_SUCCESS);
    ASSERT_NE(queue, nullptr);

    // ensure the queue has the correct context
    ASSERT_EQ(context1, queue->get_context());

    // check that the first context is now the active CUDA context
    cuCtxGetCurrent(&current);
    ASSERT_EQ(current, context1->get());

    plugin->call<detail::PiApiKind::piQueueRelease>(queue);

    // mark the first set of processing as done and notify the main thread
    std::unique_lock<std::mutex> lock(m);
    thread_done = true;
    lock.unlock();
    cv.notify_one();

    // wait for the main thread to release the first context
    lock.lock();
    cv.wait(lock, [&] { return released; });

    // check that the first context is still active; this is because deleting
    // a context only deactivates it on the thread that deletes it
    cuCtxGetCurrent(&current);
    ASSERT_EQ(current, context1->get());

    // create a queue with the second context
    ASSERT_EQ((plugin->call_nocheck<detail::PiApiKind::piQueueCreate>(
                  context2, device_, 0, &queue)),
              PI_SUCCESS);
    ASSERT_NE(queue, nullptr);

    // ensure the queue has the correct context
    ASSERT_EQ(context2, queue->get_context());

    // check that the second context is now the active CUDA context
    cuCtxGetCurrent(&current);
    ASSERT_EQ(current, context2->get());

    plugin->call<detail::PiApiKind::piQueueRelease>(queue);
  });

  // wait for the thread to be done with the first queue before releasing the
  // first context
  std::unique_lock<std::mutex> lock(m);
  cv.wait(lock, [&] { return thread_done; });
  plugin->call<detail::PiApiKind::piContextRelease>(context1);

  // notify the other thread that the context was released
  released = true;
  lock.unlock();
  cv.notify_one();

  // wait for the thread to finish
  test_thread.join();

  plugin->call<detail::PiApiKind::piContextRelease>(context2);

  // check that there is no context set on the main thread
  CUcontext current = nullptr;
  cuCtxGetCurrent(&current);
  ASSERT_EQ(current, nullptr);
}