Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][ROCm] Use offload-arch instead of mcpu for AMD arch #4239

Merged
merged 8 commits into from
Aug 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/DiagnosticDriverKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,8 @@ def err_drv_expecting_fsycl_with_sycl_opt : Error<
"'%0' must be used in conjunction with '-fsycl' to enable offloading">;
def err_drv_fsycl_with_c_type : Error<
"'%0' must not be used in conjunction with '-fsycl', which expects C++ source">;
def err_drv_sycl_missing_amdgpu_arch : Error<
"missing AMDGPU architecture for SYCL offloading; specify it with '-Xsycl-target-backend --offload-arch'">;
def warn_drv_sycl_offload_target_duplicate : Warning<
"SYCL offloading target '%0' is similar to target '%1' already specified; "
"will be ignored">, InGroup<SyclTarget>;
Expand Down
102 changes: 81 additions & 21 deletions clang/lib/Driver/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3960,8 +3960,9 @@ class OffloadingActionBuilder final {
/// List of static archives to extract FPGA dependency info from
ActionList FPGAArchiveInputs;

/// List of CUDA architectures to use in this compilation with NVPTX targets.
SmallVector<CudaArch, 8> GpuArchList;
/// List of GPU architectures to use in this compilation with NVPTX/AMDGCN
/// targets.
SmallVector<std::pair<llvm::Triple, std::string>, 8> GpuArchList;

/// Build the last steps for CUDA after all BC files have been linked.
JobAction *finalizeNVPTXDependences(Action *Input, const llvm::Triple &TT) {
Expand Down Expand Up @@ -3998,13 +3999,17 @@ class OffloadingActionBuilder final {
const Driver::InputList &Inputs)
: DeviceActionBuilder(C, Args, Inputs, Action::OFK_SYCL) {}

void withBoundArchForToolChain(const ToolChain* TC,
void withBoundArchForToolChain(const ToolChain *TC,
llvm::function_ref<void(const char *)> Op) {
if (TC->getTriple().isNVPTX())
for (CudaArch A : GpuArchList)
Op(CudaArchToString(A));
else
Op(nullptr);
for (auto &A : GpuArchList) {
if (TC->getTriple() == A.first) {
Op(Args.MakeArgString(A.second.c_str()));
return;
}
}

// no bound arch for this toolchain
Op(nullptr);
}

ActionBuilderReturnCode
Expand Down Expand Up @@ -4058,8 +4063,8 @@ class OffloadingActionBuilder final {
}
const auto *TC = ToolChains.front();
const char *BoundArch = nullptr;
if (TC->getTriple().isNVPTX())
BoundArch = CudaArchToString(GpuArchList.front());
if (TC->getTriple().isNVPTX() || TC->getTriple().isAMDGCN())
BoundArch = GpuArchList.front().second.c_str();
DA.add(*DeviceCompilerInput, *TC, BoundArch, Action::OFK_SYCL);
// Clear the input file, it is already a dependence to a host
// action.
Expand Down Expand Up @@ -4642,39 +4647,94 @@ class OffloadingActionBuilder final {
}
}

/// Initialize the GPU architecture list from arguments - this populates `GpuArchList` from
/// `--cuda-gpu-arch` flags. Only relevant if compiling to CUDA. Return true if any
/// initialization errors are found.
/// Initialize the GPU architecture list from arguments - this populates
/// `GpuArchList` from `--offload-arch` flags. Only relevant if compiling to
/// CUDA or AMDGCN. Return true if any initialization errors are found.
/// FIXME: "offload-arch" and the BoundArch mechanism should also be
// used in the SYCLToolChain for SPIR-V AOT to track the offload
// architecture instead of the Triple sub-arch it currently uses.
bool initializeGpuArchMap() {
const OptTable &Opts = C.getDriver().getOpts();
for (auto *A : Args) {
unsigned Index;
llvm::Triple *TargetBE = nullptr;

if (A->getOption().matches(options::OPT_Xsycl_backend_EQ))
auto GetTripleIt = [&, this](llvm::StringRef Triple) {
llvm::Triple TargetTriple{Triple};
auto TripleIt = llvm::find_if(SYCLTripleList, [&](auto &SYCLTriple) {
return SYCLTriple == TargetTriple;
});
return TripleIt != SYCLTripleList.end() ? &*TripleIt : nullptr;
};

if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
TargetBE = GetTripleIt(A->getValue(0));
// Passing device args: -Xsycl-target-backend=<triple> -opt=val.
if (llvm::Triple(A->getValue(0)).isNVPTX())
if (TargetBE)
Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
else
continue;
else if (A->getOption().matches(options::OPT_Xsycl_backend))
} else if (A->getOption().matches(options::OPT_Xsycl_backend)) {
if (SYCLTripleList.size() > 1) {
C.getDriver().Diag(diag::err_drv_Xsycl_target_missing_triple)
<< A->getSpelling();
continue;
}
// Passing device args: -Xsycl-target-backend -opt=val.
TargetBE = &SYCLTripleList.front();
Index = Args.getBaseArgs().MakeIndex(A->getValue(0));
else
} else
continue;

A->claim();
auto ParsedArg = Opts.ParseOneArg(Args, Index);

// TODO: Support --no-cuda-gpu-arch, --{,no-}cuda-gpu-arch=all.
if (ParsedArg &&
ParsedArg->getOption().matches(options::OPT_offload_arch_EQ)) {
llvm::StringRef ArchStr = ParsedArg->getValue(0);
if (TargetBE->isNVPTX()) {
// CUDA arch also applies to AMDGCN ...
CudaArch Arch = StringToCudaArch(ArchStr);
if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) {
C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch)
<< ArchStr;
continue;
}
ArchStr = CudaArchToString(Arch);
} else if (TargetBE->isAMDGCN()) {
llvm::StringMap<bool> Features;
auto Arch =
parseTargetID(getHIPOffloadTargetTriple(), ArchStr, &Features);
if (!Arch) {
C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << ArchStr;
continue;
}
auto CanId = getCanonicalTargetID(Arch.getValue(), Features);
ArchStr = Args.MakeArgStringRef(CanId);
}
ParsedArg->claim();
GpuArchList.push_back(StringToCudaArch(ParsedArg->getValue(0)));
GpuArchList.emplace_back(*TargetBE, ArchStr);
}
}

// If there are no CUDA architectures provided then default to SM_50.
if (GpuArchList.empty()) {
GpuArchList.push_back(CudaArch::SM_50);
// Handle defaults architectures
for (auto &Triple : SYCLTripleList) {
// For NVIDIA use SM_50 as a default
if (Triple.isNVPTX() && llvm::none_of(GpuArchList, [&](auto &P) {
return P.first.isNVPTX();
})) {
llvm::StringRef DefaultArch = CudaArchToString(CudaArch::SM_50);
GpuArchList.emplace_back(Triple, DefaultArch);
}

// For AMD require the architecture to be set by the user
if (Triple.isAMDGCN() && llvm::none_of(GpuArchList, [&](auto &P) {
return P.first.isAMDGCN();
})) {
C.getDriver().Diag(clang::diag::err_drv_sycl_missing_amdgpu_arch);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about adding a default AMD GPU arch?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason I didn't go for the default AMD GPU arch is that as I understand it we need to specify the exact GPU architecture for AMD so a default would only work for a very specific type of GPUs. Which means that in a lot of cases users would still need to specify the architecture manually, so I think it is better to force the architecture to always be set manually and have a clear diagnostic, than have a default architecture that rarely works and a more confusing error message from hip.

This is different with NVidia because SM_50 covers a lot of different GPUs, so in most cases it will work out of the box and the user won't have to set the architecture.

return true;
}
}

return false;
Expand Down
5 changes: 3 additions & 2 deletions clang/lib/Driver/ToolChain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1174,8 +1174,9 @@ llvm::opt::DerivedArgList *ToolChain::TranslateOffloadTargetArgs(
// at all, target and host share a toolchain.
if (A->getOption().matches(options::OPT_m_Group)) {
// AMD GPU is a special case, as -mcpu is required for the device
// compilation.
if (SameTripleAsHost || getTriple().getArch() == llvm::Triple::amdgcn)
// compilation, except for SYCL which uses --offload-arch.
if (SameTripleAsHost || (getTriple().getArch() == llvm::Triple::amdgcn &&
DeviceOffloadKind != Action::OFK_SYCL))
DAL->append(A);
else
Modified = true;
Expand Down
42 changes: 24 additions & 18 deletions clang/test/Driver/sycl-offload-amdgcn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,38 +3,44 @@

// UNSUPPORTED: system-windows

// Check that the offload arch is required
// RUN: %clangxx -### -std=c++11 -target x86_64-unknown-linux-gnu -fsycl \
// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-ARCH %s
// CHK-ARCH: error: missing AMDGPU architecture for SYCL offloading; specify it with '-Xsycl-target-backend --offload-arch'

/// Check action graph.
// RUN: %clangxx -### -std=c++11 -target x86_64-unknown-linux-gnu -fsycl \
// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 \
// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -Xsycl-target-backend --offload-arch=gfx906 \
// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-ACTIONS %s
// CHK-ACTIONS: "-cc1" "-triple" "amdgcn-amd-amdhsa-sycldevice" "-aux-triple" "x86_64-unknown-linux-gnu"{{.*}} "-fsycl-is-device"{{.*}} "-Wno-sycl-strict"{{.*}} "-sycl-std=2020" {{.*}} "-internal-isystem" "{{.*}}bin{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}sycl"{{.*}} "-mlink-builtin-bitcode" "{{.*}}libspirv.bc"{{.*}} "-target-cpu" "gfx906"{{.*}} "-std=c++11"{{.*}}
// CHK-ACTIONS-NOT: "-mllvm -sycl-opt"
// CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-target=amdgcn" "-kind=sycl"{{.*}}
// CHK-ACTIONS: clang-offload-wrapper"{{.*}} "-host=x86_64-unknown-linux-gnu" "-compile-opts=--offload-arch=gfx906" "-target=amdgcn" "-kind=sycl"{{.*}}

/// Check phases w/out specifying a compute capability.
// RUN: %clangxx -ccc-print-phases -std=c++11 -target x86_64-unknown-linux-gnu -fsycl \
// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -mcpu=gfx906 %s 2>&1 \
// RUN: -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -Xsycl-target-backend --offload-arch=gfx906 %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-PHASES-NO-CC %s
// CHK-PHASES-NO-CC: 0: input, "{{.*}}", c++, (host-sycl)
// CHK-PHASES-NO-CC: 1: append-footer, {0}, c++, (host-sycl)
// CHK-PHASES-NO-CC: 2: preprocessor, {1}, c++-cpp-output, (host-sycl)
// CHK-PHASES-NO-CC: 3: input, "{{.*}}", c++, (device-sycl)
// CHK-PHASES-NO-CC: 4: preprocessor, {3}, c++-cpp-output, (device-sycl)
// CHK-PHASES-NO-CC: 5: compiler, {4}, ir, (device-sycl)
// CHK-PHASES-NO-CC: 6: offload, "host-sycl (x86_64-unknown-linux-gnu)" {2}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {5}, c++-cpp-output
// CHK-PHASES-NO-CC: 3: input, "{{.*}}", c++, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 4: preprocessor, {3}, c++-cpp-output, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 5: compiler, {4}, ir, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 6: offload, "host-sycl (x86_64-unknown-linux-gnu)" {2}, "device-sycl (amdgcn-amd-amdhsa-sycldevice:gfx906)" {5}, c++-cpp-output
// CHK-PHASES-NO-CC: 7: compiler, {6}, ir, (host-sycl)
// CHK-PHASES-NO-CC: 8: backend, {7}, assembler, (host-sycl)
// CHK-PHASES-NO-CC: 9: assembler, {8}, object, (host-sycl)
// CHK-PHASES-NO-CC: 10: linker, {9}, image, (host-sycl)
// CHK-PHASES-NO-CC: 11: linker, {5}, ir, (device-sycl)
// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, ir, (device-sycl)
// CHK-PHASES-NO-CC: 13: file-table-tform, {12}, ir, (device-sycl)
// CHK-PHASES-NO-CC: 14: backend, {13}, assembler, (device-sycl)
// CHK-PHASES-NO-CC: 15: assembler, {14}, object, (device-sycl)
// CHK-PHASES-NO-CC: 16: linker, {15}, image, (device-sycl)
// CHK-PHASES-NO-CC: 17: linker, {16}, hip-fatbin, (device-sycl)
// CHK-PHASES-NO-CC: 18: foreach, {13, 17}, hip-fatbin, (device-sycl)
// CHK-PHASES-NO-CC: 19: file-table-tform, {12, 18}, tempfiletable, (device-sycl)
// CHK-PHASES-NO-CC: 20: clang-offload-wrapper, {19}, object, (device-sycl)
// CHK-PHASES-NO-CC: 21: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (amdgcn-amd-amdhsa-sycldevice)" {20}, image
// CHK-PHASES-NO-CC: 11: linker, {5}, ir, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 12: sycl-post-link, {11}, ir, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 13: file-table-tform, {12}, ir, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 14: backend, {13}, assembler, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 15: assembler, {14}, object, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 16: linker, {15}, image, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 17: linker, {16}, hip-fatbin, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 18: foreach, {13, 17}, hip-fatbin, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 19: file-table-tform, {12, 18}, tempfiletable, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 20: clang-offload-wrapper, {19}, object, (device-sycl, gfx906)
// CHK-PHASES-NO-CC: 21: offload, "host-sycl (x86_64-unknown-linux-gnu)" {10}, "device-sycl (amdgcn-amd-amdhsa-sycldevice:gfx906)" {20}, image
2 changes: 1 addition & 1 deletion clang/test/Driver/sycl-triple-dae-flags.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -### -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -fsycl-dead-args-optimization %s 2> %t.cuda.out
// RUN: FileCheck %s --input-file %t.cuda.out
//
// RUN: %clangxx -### -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -fsycl-dead-args-optimization %s 2> %t.rocm.out
// RUN: %clangxx -### -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice -Xsycl-target-backend --offload-arch=gfx906 -fsycl-dead-args-optimization %s 2> %t.rocm.out
// RUN: FileCheck %s --input-file %t.rocm.out
// CHECK-NOT: -fenable-sycl-dae
// CHECK-NOT: -emit-param-info
Expand Down
7 changes: 5 additions & 2 deletions sycl/doc/GetStartedGuide.md
Original file line number Diff line number Diff line change
Expand Up @@ -547,11 +547,14 @@ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \
simple-sycl-app.cpp -o simple-sycl-app-cuda.exe
```

When building for ROCm, please note that the option `mcpu` must be specified, use the ROCm target triple as follows:
When building for ROCm, use the ROCm target triple and specify the
target architecture with `-Xsycl-target-backend --offload-arch=<arch>`
as follows:

```bash
clang++ -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice \
-mcpu=gfx906 simple-sycl-app.cpp -o simple-sycl-app-cuda.exe
-Xsycl-target-backend --offload-arch=gfx906 \
simple-sycl-app.cpp -o simple-sycl-app-amd.exe
```

To build simple-sycl-app ahead of time for GPU, CPU or Accelerator devices,
Expand Down