Skip to content

Commit a6218a7

Browse files
committed
[SYCL][ROCm] Use offload-arch instead of mcpu for AMD arch
This patch changes using `-mcpu` for SYCL applications targeting AMD to `-Xsycl-target-backend --offload-arch`. Before this patch the offloading arch wasn't set correctly for AMD architectures. This is fixing an issue with HIP that was talked about in intel#4133, regarding having `v4` in the hip part of the triple, without the `v4` HIP seems to be ignoring the fact that the offloading arch is missing from the triple, which is why there was a workaround orignally to force not using `v4` with SYCL. By fixing the offloading arch this patch fixes the issue properly and now the triple with `v4` works because it also contains the offloading architecture.
1 parent 9ddb8f0 commit a6218a7

File tree

4 files changed

+89
-24
lines changed

4 files changed

+89
-24
lines changed

clang/include/clang/Basic/DiagnosticDriverKinds.td

+2
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,8 @@ def err_drv_expecting_fsycl_with_sycl_opt : Error<
289289
"'%0' must be used in conjunction with '-fsycl' to enable offloading">;
290290
def err_drv_fsycl_with_c_type : Error<
291291
"'%0' must not be used in conjunction with '-fsycl', which expects C++ source">;
292+
def err_drv_sycl_missing_amdgpu_arch : Error<
293+
"Missing AMDGPU architecture for SYCL offloading. Please specify it with -Xsycl-target-backend --offload-arch.">;
292294
def warn_drv_sycl_offload_target_duplicate : Warning<
293295
"SYCL offloading target '%0' is similar to target '%1' already specified; "
294296
"will be ignored">, InGroup<SyclTarget>;

clang/lib/Driver/Driver.cpp

+79-20
Original file line numberDiff line numberDiff line change
@@ -3961,7 +3961,7 @@ class OffloadingActionBuilder final {
39613961
ActionList FPGAArchiveInputs;
39623962

39633963
/// List of CUDA architectures to use in this compilation with NVPTX targets.
3964-
SmallVector<CudaArch, 8> GpuArchList;
3964+
SmallVector<std::pair<llvm::Triple, std::string>, 8> GpuArchList;
39653965

39663966
/// Build the last steps for CUDA after all BC files have been linked.
39673967
JobAction *finalizeNVPTXDependences(Action *Input, const llvm::Triple &TT) {
@@ -3998,13 +3998,17 @@ class OffloadingActionBuilder final {
39983998
const Driver::InputList &Inputs)
39993999
: DeviceActionBuilder(C, Args, Inputs, Action::OFK_SYCL) {}
40004000

4001-
void withBoundArchForToolChain(const ToolChain* TC,
4001+
void withBoundArchForToolChain(const ToolChain *TC,
40024002
llvm::function_ref<void(const char *)> Op) {
4003-
if (TC->getTriple().isNVPTX())
4004-
for (CudaArch A : GpuArchList)
4005-
Op(CudaArchToString(A));
4006-
else
4007-
Op(nullptr);
4003+
for (auto &A : GpuArchList) {
4004+
if (TC->getTriple() == A.first) {
4005+
Op(Args.MakeArgString(A.second.c_str()));
4006+
return;
4007+
}
4008+
}
4009+
4010+
// no bound arch for this toolchain
4011+
Op(nullptr);
40084012
}
40094013

40104014
ActionBuilderReturnCode
@@ -4058,8 +4062,8 @@ class OffloadingActionBuilder final {
40584062
}
40594063
const auto *TC = ToolChains.front();
40604064
const char *BoundArch = nullptr;
4061-
if (TC->getTriple().isNVPTX())
4062-
BoundArch = CudaArchToString(GpuArchList.front());
4065+
if (TC->getTriple().isNVPTX() || TC->getTriple().isAMDGCN())
4066+
BoundArch = GpuArchList.front().second.c_str();
40634067
DA.add(*DeviceCompilerInput, *TC, BoundArch, Action::OFK_SYCL);
40644068
// Clear the input file, it is already a dependence to a host
40654069
// action.
@@ -4642,39 +4646,94 @@ class OffloadingActionBuilder final {
46424646
}
46434647
}
46444648

4645-
/// Initialize the GPU architecture list from arguments - this populates `GpuArchList` from
4646-
/// `--cuda-gpu-arch` flags. Only relevant if compiling to CUDA. Return true if any
4647-
/// initialization errors are found.
4649+
/// Initialize the GPU architecture list from arguments - this populates
4650+
/// `GpuArchList` from `--offload-arch` flags. Only relevant if compiling to
4651+
/// CUDA or AMDGCN. Return true if any initialization errors are found.
4652+
/// FIXME: SPIR-V AOT targets should also use `offload-arch` to better fit
4653+
/// in the standard model.
46484654
bool initializeGpuArchMap() {
46494655
const OptTable &Opts = C.getDriver().getOpts();
46504656
for (auto *A : Args) {
46514657
unsigned Index;
4658+
llvm::Triple *TargetBE = nullptr;
46524659

4653-
if (A->getOption().matches(options::OPT_Xsycl_backend_EQ))
4660+
auto GetTripleIt = [&, this](llvm::StringRef Triple) {
4661+
llvm::Triple TargetTriple{Triple};
4662+
auto TripleIt = llvm::find_if(SYCLTripleList, [&](auto &SYCLTriple) {
4663+
return SYCLTriple == TargetTriple;
4664+
});
4665+
return TripleIt != SYCLTripleList.end() ? &*TripleIt : nullptr;
4666+
};
4667+
4668+
if (A->getOption().matches(options::OPT_Xsycl_backend_EQ)) {
4669+
TargetBE = GetTripleIt(A->getValue(0));
46544670
// Passing device args: -Xsycl-target-backend=<triple> -opt=val.
4655-
if (llvm::Triple(A->getValue(0)).isNVPTX())
4671+
if (TargetBE)
46564672
Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
46574673
else
46584674
continue;
4659-
else if (A->getOption().matches(options::OPT_Xsycl_backend))
4675+
} else if (A->getOption().matches(options::OPT_Xsycl_backend)) {
4676+
if (SYCLTripleList.size() > 1) {
4677+
C.getDriver().Diag(diag::err_drv_Xsycl_target_missing_triple)
4678+
<< A->getSpelling();
4679+
continue;
4680+
}
46604681
// Passing device args: -Xsycl-target-backend -opt=val.
4682+
TargetBE = &SYCLTripleList.front();
46614683
Index = Args.getBaseArgs().MakeIndex(A->getValue(0));
4662-
else
4684+
} else
46634685
continue;
46644686

46654687
A->claim();
46664688
auto ParsedArg = Opts.ParseOneArg(Args, Index);
4689+
46674690
// TODO: Support --no-cuda-gpu-arch, --{,no-}cuda-gpu-arch=all.
46684691
if (ParsedArg &&
46694692
ParsedArg->getOption().matches(options::OPT_offload_arch_EQ)) {
4693+
llvm::StringRef ArchStr = ParsedArg->getValue(0);
4694+
if (TargetBE->isNVPTX()) {
4695+
// CUDA arch also applies to AMDGCN ...
4696+
CudaArch Arch = StringToCudaArch(ArchStr);
4697+
if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) {
4698+
C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch)
4699+
<< ArchStr;
4700+
continue;
4701+
}
4702+
ArchStr = CudaArchToString(Arch);
4703+
} else if (TargetBE->isAMDGCN()) {
4704+
llvm::StringMap<bool> Features;
4705+
auto Arch =
4706+
parseTargetID(getHIPOffloadTargetTriple(), ArchStr, &Features);
4707+
if (!Arch) {
4708+
C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << ArchStr;
4709+
continue;
4710+
}
4711+
auto CanId = getCanonicalTargetID(Arch.getValue(), Features);
4712+
ArchStr = Args.MakeArgStringRef(CanId);
4713+
}
46704714
ParsedArg->claim();
4671-
GpuArchList.push_back(StringToCudaArch(ParsedArg->getValue(0)));
4715+
GpuArchList.emplace_back(*TargetBE, ArchStr);
46724716
}
46734717
}
46744718

4675-
// If there are no CUDA architectures provided then default to SM_50.
4676-
if (GpuArchList.empty()) {
4677-
GpuArchList.push_back(CudaArch::SM_50);
4719+
// Handle defaults architectures
4720+
for (auto &Triple : SYCLTripleList) {
4721+
// For NVIDIA use SM_50 as a default
4722+
if (Triple.isNVPTX() && llvm::none_of(GpuArchList, [&](auto &P) {
4723+
return P.first.isNVPTX();
4724+
})) {
4725+
llvm::StringRef DefaultArch = CudaArchToString(CudaArch::SM_50);
4726+
GpuArchList.emplace_back(Triple, DefaultArch);
4727+
}
4728+
4729+
// For AMD require the architecture to be set by the user
4730+
if (Triple.isAMDGCN() && llvm::none_of(GpuArchList, [&](auto &P) {
4731+
return P.first.isAMDGCN();
4732+
})) {
4733+
C.getDriver().Diag(
4734+
clang::diag::err_drv_sycl_missing_amdgpu_arch);
4735+
continue;
4736+
}
46784737
}
46794738

46804739
return false;

clang/lib/Driver/ToolChain.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -1174,8 +1174,9 @@ llvm::opt::DerivedArgList *ToolChain::TranslateOffloadTargetArgs(
11741174
// at all, target and host share a toolchain.
11751175
if (A->getOption().matches(options::OPT_m_Group)) {
11761176
// AMD GPU is a special case, as -mcpu is required for the device
1177-
// compilation.
1178-
if (SameTripleAsHost || getTriple().getArch() == llvm::Triple::amdgcn)
1177+
// compilation, except for SYCL which uses --offload-arch.
1178+
if (SameTripleAsHost || (getTriple().getArch() == llvm::Triple::amdgcn &&
1179+
DeviceOffloadKind != Action::OFK_SYCL))
11791180
DAL->append(A);
11801181
else
11811182
Modified = true;

sycl/doc/GetStartedGuide.md

+5-2
Original file line numberDiff line numberDiff line change
@@ -547,11 +547,14 @@ clang++ -fsycl -fsycl-targets=nvptx64-nvidia-cuda-sycldevice \
547547
simple-sycl-app.cpp -o simple-sycl-app-cuda.exe
548548
```
549549
550-
When building for ROCm, please note that the option `mcpu` must be specified, use the ROCm target triple as follows:
550+
When building for ROCm, use the ROCm target triple and specify the
551+
target architecture with `-Xsycl-target-backend --offload-arch=<arch>`
552+
as follows:
551553
552554
```bash
553555
clang++ -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycldevice \
554-
-mcpu=gfx906 simple-sycl-app.cpp -o simple-sycl-app-cuda.exe
556+
-Xsycl-target-backend --offload-arch=gfx906 \
557+
simple-sycl-app.cpp -o simple-sycl-app-amd.exe
555558
```
556559
557560
To build simple-sycl-app ahead of time for GPU, CPU or Accelerator devices,

0 commit comments

Comments
 (0)