Skip to content

Commit 8272490

Browse files
authored
AMDGPU ukernels: Bazel build, separate bitcode files, c-embed archives. (#19274)
1. Implement Bazel, generate CMake from Bazel. 2. Split .bc bitcode files, one .bc file <-> one ukernel function. 3. Generate embedded-data archives. 4. Update the compiler code to use the embedded-data archives. 5. Simplify setAlwaysInline now that we are no longer dealing with HIP symbols. --------- Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
1 parent 677ae42 commit 8272490

File tree

12 files changed

+540
-239
lines changed

12 files changed

+540
-239
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ Testing/
2727

2828
# Bazel artifacts
2929
**/bazel-*
30+
MODULE.bazel
31+
MODULE.bazel.lock
3032

3133
# Executables
3234
*.exe

build_tools/bazel/iree_bitcode_library.bzl

+74
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,80 @@ def iree_cuda_bitcode_library(
255255
**kwargs
256256
)
257257

258+
def iree_amdgpu_bitcode_library(
259+
name,
260+
gpu_arch,
261+
srcs,
262+
copts = [],
263+
out = None,
264+
**kwargs):
265+
"""Builds an AMDGPU LLVM bitcode library from an input file using clang.
266+
267+
Args:
268+
name: Name of the target.
269+
gpu_arch: Target AMDGPU architecture, e.g. gfx942.
270+
srcs: Source files to pass to clang. Headers (*.h) are for dependency
271+
tracking only. Current limitation: only one non-header source is
272+
supported.
273+
copts: Additional flags to pass to clang.
274+
out: Output file name. Defaults to {source.c}.{gpu_arch}.bc.
275+
**kwargs: any additional attributes to pass to the underlying rules.
276+
"""
277+
278+
clang_tool = "@llvm-project//clang:clang"
279+
280+
base_copts = [
281+
# Language: C23.
282+
"-std=c23",
283+
284+
# Avoid dependencies.
285+
"-nogpulib",
286+
287+
# Avoid ABI issues.
288+
"-fno-short-wchar", # Shouldn't matter to us, but doesn't hurt.
289+
290+
# Target architecture/machine.
291+
"-target",
292+
"amdgcn-amd-amdhsa",
293+
"-march=%s" % gpu_arch,
294+
"-fgpu-rdc", # NOTE: may not be required for all targets.
295+
296+
# Optimized.
297+
"-O3",
298+
"-fno-ident",
299+
"-fvisibility=hidden",
300+
301+
# Object file only in bitcode format.
302+
"-c",
303+
"-emit-llvm",
304+
]
305+
306+
non_header_srcs = [src for src in srcs if not src.endswith(".h")]
307+
if len(non_header_srcs) != 1:
308+
fail("Expected exactly one non-header file in srcs, got srcs=[" + ", ".join(srcs) + "]")
309+
src = non_header_srcs[0]
310+
311+
if not out:
312+
out = "%s.%s.bc" % (src, gpu_arch)
313+
314+
native.genrule(
315+
name = "gen_%s" % (out),
316+
srcs = srcs,
317+
outs = [out],
318+
cmd = " ".join([
319+
"$(location %s)" % (clang_tool),
320+
"$(location %s)" % (src),
321+
"-o $(location %s)" % (out),
322+
"-I .",
323+
] + base_copts + copts),
324+
tools = [
325+
clang_tool,
326+
],
327+
message = "Compiling %s to %s..." % (src, out),
328+
output_to_bindir = 1,
329+
**kwargs
330+
)
331+
258332
def iree_link_bitcode(
259333
name,
260334
bitcode_files,

build_tools/bazel_to_cmake/bazel_to_cmake_converter.py

+19
Original file line numberDiff line numberDiff line change
@@ -610,6 +610,25 @@ def iree_cuda_bitcode_library(
610610
f")\n\n"
611611
)
612612

613+
def iree_amdgpu_bitcode_library(self, name, gpu_arch, srcs, copts=None, out=None):
614+
name_block = self._convert_string_arg_block("NAME", name, quote=False)
615+
gpu_arch_block = self._convert_string_arg_block(
616+
"GPU_ARCH", gpu_arch, quote=False
617+
)
618+
srcs_block = self._convert_srcs_block(srcs)
619+
out_block = self._convert_string_arg_block("OUT", out, quote=False)
620+
copts_block = self._convert_string_list_block("COPTS", copts, sort=False)
621+
622+
self._converter.body += (
623+
f"iree_amdgpu_bitcode_library(\n"
624+
f"{name_block}"
625+
f"{gpu_arch_block}"
626+
f"{srcs_block}"
627+
f"{out_block}"
628+
f"{copts_block}"
629+
f")\n\n"
630+
)
631+
613632
def iree_link_bitcode(self, name, bitcode_files):
614633
name_block = self._convert_string_arg_block("NAME", name, quote=False)
615634
bitcode_files_block = self._convert_srcs_block(

build_tools/cmake/iree_bitcode_library.cmake

+92
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,98 @@ function(iree_cuda_bitcode_library)
227227
)
228228
endfunction()
229229

230+
# iree_amdgpu_bitcode_library()
231+
#
232+
# Builds an AMDGPU LLVM bitcode library from an input file via clang.
233+
#
234+
# Parameters:
235+
# NAME: Name of the target.
236+
# GPU_ARCH: Target AMDGPU architecture, e.g. gfx942.
237+
# SRCS: Source files to pass to clang. Headers (*.h) are for dependency
238+
# tracking only. Current limitation: only one non-header source is
239+
# supported.
240+
# COPTS: Additional flags to pass to clang.
241+
# OUT: Output file name. Defaults to {source.c}.{gpu_arch}.bc.
242+
#
243+
function(iree_amdgpu_bitcode_library)
244+
cmake_parse_arguments(
245+
_RULE
246+
""
247+
"NAME;OUT;GPU_ARCH"
248+
"SRCS;COPTS"
249+
${ARGN}
250+
)
251+
252+
set(_SRC "")
253+
foreach(_SRCS_ENTRY IN LISTS _RULE_SRCS)
254+
if(_SRCS_ENTRY MATCHES "\.h$")
255+
continue()
256+
endif()
257+
if (_SRC)
258+
message(SEND_ERROR "Current limitation: only one non-header file allowed in SRCS.")
259+
endif()
260+
set(_SRC "${_SRCS_ENTRY}")
261+
endforeach()
262+
if(NOT _SRC)
263+
message(SEND_ERROR "Error: no non-header file found in SRCS=${_RULE_SRCS}.")
264+
endif()
265+
266+
if(DEFINED _RULE_OUT)
267+
set(_OUT "${_RULE_OUT}")
268+
else()
269+
set(_OUT "${_SRC}.${_RULE_GPU_ARCH}.bc")
270+
endif()
271+
272+
set(_COPTS
273+
# Language: C23
274+
"-std=c23"
275+
276+
# Avoid dependencies.
277+
"-nogpulib"
278+
279+
# Avoid ABI issues.
280+
"-fno-short-wchar" # Shouldn't matter to us, but doesn't hurt.
281+
282+
# Target architecture/machine.
283+
"-target"
284+
"amdgcn-amd-amdhsa"
285+
"-march=${_RULE_GPU_ARCH}"
286+
"-fgpu-rdc" # NOTE: may not be required for all targets.
287+
288+
# Optimized.
289+
"-O3"
290+
"-fno-ident"
291+
"-fvisibility=hidden"
292+
293+
# Object file only in bitcode format.
294+
"-c"
295+
"-emit-llvm"
296+
)
297+
298+
add_custom_command(
299+
OUTPUT
300+
"${_OUT}"
301+
COMMAND
302+
"${IREE_CLANG_BINARY}"
303+
${_COPTS}
304+
"-I" "${IREE_SOURCE_DIR}"
305+
"${CMAKE_CURRENT_SOURCE_DIR}/${_SRC}"
306+
"-o" "${_OUT}"
307+
DEPENDS
308+
"${IREE_CLANG_BINARY}"
309+
"${_RULE_SRCS}"
310+
COMMENT
311+
"Compiling ${_SRC} to ${_OUT}"
312+
VERBATIM
313+
)
314+
315+
# Only add iree_${NAME} as custom target doesn't support aliasing to
316+
# iree::${NAME}.
317+
iree_package_name(_PACKAGE_NAME)
318+
add_custom_target("${_PACKAGE_NAME}_${_RULE_NAME}"
319+
DEPENDS "${_OUT}"
320+
)
321+
endfunction()
230322

231323
# iree_link_bitcode()
232324
#

compiler/plugins/target/ROCM/BUILD.bazel

+4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ iree_compiler_cc_library(
2727
"ROCMTargetUtils.h",
2828
],
2929
deps = [
30+
"//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1030",
31+
"//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx1100",
32+
"//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx90a",
33+
"//compiler/plugins/target/ROCM/builtins/ukernel:iree_uk_amdgpu_gfx942",
3034
"//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
3135
"//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
3236
"//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets",

compiler/plugins/target/ROCM/CMakeLists.txt

+4
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ iree_cc_library(
6464
iree::compiler::Dialect::HAL::Utils::LLVMLinkerUtils
6565
iree::compiler::PluginAPI
6666
iree::compiler::Utils
67+
iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1030
68+
iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx1100
69+
iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx90a
70+
iree::compiler::plugins::target::ROCM::builtins::ukernel::iree_uk_amdgpu_gfx942
6771
iree::schemas::amdgpu_executable_def_c_fbs
6872
iree::schemas::executable_debug_info_c_fbs
6973
iree::schemas::hip_executable_def_c_fbs

compiler/plugins/target/ROCM/ROCMTargetUtils.cpp

+34-65
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,14 @@
66

77
#include "compiler/plugins/target/ROCM/ROCMTargetUtils.h"
88

9+
#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1030.h"
10+
#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx1100.h"
11+
#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx90a.h"
12+
#include "compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_gfx942.h"
913
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
1014
#include "iree/compiler/Dialect/HAL/Utils/LLVMLinkerUtils.h"
1115
#include "iree/compiler/Utils/ToolUtils.h"
16+
#include "llvm/ADT/StringSwitch.h"
1217
#include "llvm/IR/Constants.h"
1318
#include "llvm/IR/Module.h"
1419
#include "llvm/IRReader/IRReader.h"
@@ -79,76 +84,28 @@ static LogicalResult linkWithBitcodeFiles(Location loc, llvm::Module *module,
7984
}
8085

8186
static LogicalResult linkBitcodeFile(Location loc, llvm::Linker &linker,
82-
unsigned linkerFlags, StringRef path,
87+
unsigned linkerFlags, StringRef filename,
88+
StringRef contents,
8389
llvm::TargetMachine &targetMachine,
8490
llvm::LLVMContext &context) {
85-
auto bitcodeBufferRef = llvm::MemoryBuffer::getFile(path);
86-
if (auto ec = bitcodeBufferRef.getError()) {
87-
return mlir::emitError(loc) << "failed reading user bitcode file `" << path
88-
<< "`: " << ec.message();
89-
}
91+
llvm::MemoryBufferRef bitcodeBufferRef(contents, filename);
9092
auto setAlwaysInline = [&](llvm::Module &module) {
91-
if (targetMachine.getTargetCPU().contains("gfx10") ||
92-
targetMachine.getTargetCPU().contains("gfx11")) {
93-
// Some ROCM/HIP functions for gfx10 or gfx11 has accuracy issue if
94-
// inlined.
95-
return;
96-
}
9793
for (auto &func : module.getFunctionList()) {
98-
// Some ROCM/HIP builtin functions have Optnone and NoInline for default.
99-
if (targetMachine.getTargetTriple().isAMDGCN()) {
100-
if (func.hasFnAttribute(llvm::Attribute::OptimizeNone)) {
101-
func.removeFnAttr(llvm::Attribute::OptimizeNone);
102-
}
103-
if (targetMachine.getTargetTriple().isAMDGCN() &&
104-
func.hasFnAttribute(llvm::Attribute::NoInline)) {
105-
func.removeFnAttr(llvm::Attribute::NoInline);
106-
}
107-
}
10894
func.addFnAttr(llvm::Attribute::AlwaysInline);
10995
}
11096
};
111-
if (failed(linkBitcodeModule(
112-
loc, linker, linkerFlags, targetMachine, path,
113-
llvm::parseBitcodeFile(*bitcodeBufferRef->get(), context),
114-
setAlwaysInline))) {
97+
if (failed(
98+
linkBitcodeModule(loc, linker, linkerFlags, targetMachine, filename,
99+
llvm::parseBitcodeFile(bitcodeBufferRef, context),
100+
setAlwaysInline))) {
115101
return mlir::emitError(loc) << "failed linking in user bitcode file `"
116-
<< path << "` for target triple '"
102+
<< filename << "` for target triple '"
117103
<< targetMachine.getTargetTriple().str() << "'";
118104
}
119105

120106
return success();
121107
}
122108

123-
static std::vector<std::string> getUkernelPaths(StringRef enabledUkernelsStr,
124-
StringRef targetChip,
125-
StringRef bitcodePath) {
126-
std::vector<std::string> selectedUkernelNames;
127-
if (enabledUkernelsStr == "all") {
128-
const char *allUkernelNames[] = {"argmax"};
129-
size_t numUkernels = sizeof(allUkernelNames) / sizeof(allUkernelNames[0]);
130-
for (int i = 0; i < numUkernels; i++) {
131-
selectedUkernelNames.push_back(allUkernelNames[i]);
132-
}
133-
} else {
134-
while (!enabledUkernelsStr.empty()) {
135-
auto split = enabledUkernelsStr.split(',');
136-
selectedUkernelNames.push_back(split.first.str());
137-
enabledUkernelsStr = split.second;
138-
}
139-
}
140-
141-
// Construct full path to ROCDL bitcode libraries.
142-
std::vector<std::string> result;
143-
std::string app = "/";
144-
for (auto &kernelName : selectedUkernelNames) {
145-
std::string filename =
146-
"rocm_" + kernelName + "_ukernel_" + targetChip.str();
147-
result.push_back(bitcodePath.str() + app + filename + ".bc");
148-
}
149-
return result;
150-
}
151-
152109
static void overridePlatformGlobal(llvm::Module *module, StringRef globalName,
153110
uint32_t newValue, llvm::Type *globalTy) {
154111
// NOTE: the global will not be defined if it is not used in the module.
@@ -228,24 +185,36 @@ LogicalResult linkHIPBitcodeIfNeeded(Location loc, llvm::Module *module,
228185
return linkWithBitcodeFiles(loc, module, bitcodePaths);
229186
}
230187

188+
static std::tuple<const iree_file_toc_t *, int>
189+
getUkernelBitcodeTOC(StringRef gpuArch) {
190+
return llvm::StringSwitch<std::tuple<const iree_file_toc_t *, int>>(gpuArch)
191+
.Case("gfx90a",
192+
{iree_uk_amdgpu_gfx90a_create(), iree_uk_amdgpu_gfx90a_size()})
193+
.Case("gfx942",
194+
{iree_uk_amdgpu_gfx942_create(), iree_uk_amdgpu_gfx942_size()})
195+
.Case("gfx1030",
196+
{iree_uk_amdgpu_gfx1030_create(), iree_uk_amdgpu_gfx1030_size()})
197+
.Case("gfx1100",
198+
{iree_uk_amdgpu_gfx1100_create(), iree_uk_amdgpu_gfx1100_size()})
199+
.Default({nullptr, 0});
200+
}
201+
231202
// Links optimized Ukernel bitcode into the given module if the module needs it.
232203
LogicalResult linkUkernelBitcodeFiles(Location loc, llvm::Module *module,
233204
StringRef enabledUkernelsStr,
234205
StringRef targetChip,
235206
StringRef bitcodePath,
236207
unsigned linkerFlags,
237208
llvm::TargetMachine &targetMachine) {
238-
// Early exit if Ukernel not supported on target chip.
239-
if (!iree_compiler::hasUkernelSupportedRocmArch(targetChip)) {
240-
return mlir::emitError(loc)
241-
<< "ukernel '" << enabledUkernelsStr
242-
<< "' not supported on target chip: " << targetChip;
209+
auto [toc, toc_size] = getUkernelBitcodeTOC(targetChip);
210+
if (!toc) {
211+
return failure();
243212
}
244-
std::vector<std::string> ukernelPaths =
245-
getUkernelPaths(enabledUkernelsStr, targetChip, bitcodePath);
213+
246214
llvm::Linker linker(*module);
247-
for (auto &path : ukernelPaths) {
248-
if (failed(linkBitcodeFile(loc, linker, linkerFlags, StringRef(path),
215+
for (int i = 0; i < toc_size; ++i) {
216+
if (failed(linkBitcodeFile(loc, linker, linkerFlags, toc[i].name,
217+
llvm::StringRef(toc[i].data, toc[i].size),
249218
targetMachine, module->getContext())))
250219
return failure();
251220
}

compiler/plugins/target/ROCM/ROCMTargetUtils.h

-3
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,6 @@ LogicalResult linkUkernelBitcodeFiles(Location loc, llvm::Module *module,
3434
// a blob.
3535
std::string createHsaco(Location loc, StringRef isa, StringRef name);
3636

37-
// Returns true if the rocm architecture target is supported for ukernels.
38-
bool hasUkernelSupportedRocmArch(IREE::HAL::ExecutableTargetAttr targetAttr);
39-
4037
} // namespace mlir::iree_compiler::IREE::HAL
4138

4239
#endif // IREE_COMPILER_PLUGINS_TARGET_ROCM_ROCMTARGETUTILS_H_

0 commit comments

Comments
 (0)