Commit e3b6bea (parent 368735f)

[LLVMGPU] Enable TileAndFuse for matmul by default

File tree: 5 files changed, +25 -15 lines

compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp (+18 -10)

@@ -50,22 +50,19 @@
 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
 namespace mlir::iree_compiler {
 
-llvm::cl::opt<bool> clGPUTestTileAndFuseMatmul(
-    "iree-codegen-llvmgpu-test-tile-and-fuse-matmul",
+// TODO: Formalize flag under LLVMGPU opt levels.
+llvm::cl::opt<bool> clGPUUseTileAndFuseMatmul(
+    "iree-codegen-llvmgpu-use-tile-and-fuse-matmul",
     llvm::cl::desc("test the the tile and fuse pipeline for matmul"),
-    llvm::cl::init(false));
-
-llvm::cl::opt<bool> clGPUTestTileAndFuseVectorize(
-    "iree-codegen-llvmgpu-test-tile-and-fuse-vectorize",
-    llvm::cl::desc(
-        "test the tile and fuse pipeline for all supported operations"),
-    llvm::cl::init(false));
+    llvm::cl::init(true));
 
+// TODO: Formalize flag under LLVMGPU opt levels.
 llvm::cl::opt<bool> clLLVMGPUVectorizePipeline(
     "iree-codegen-llvmgpu-vectorize-pipeline",
     llvm::cl::desc("forces use of the legacy LLVMGPU vectorize pipeline"),
     llvm::cl::init(false));
 
+// TODO: Formalize flag under LLVMGPU opt levels.
 llvm::cl::opt<bool> clGPUEnableVectorDistribution(
     "iree-codegen-llvmgpu-use-vector-distribution",
     llvm::cl::desc("enable the usage of the vector distribution pipeline"),
@@ -80,24 +77,28 @@ llvm::cl::opt<bool> clGPUUnalignedGEMMVectorDistribution(
                    "unaligned GEMMs when supported"),
     llvm::cl::init(false));
 
+// TODO: Formalize flag under LLVMGPU opt levels.
 llvm::cl::opt<bool> clGPUUseTileAndFuseConvolution(
     "iree-codegen-llvmgpu-use-tile-and-fuse-convolution",
     llvm::cl::desc(
         "enable the tile and fuse pipeline for supported convolutions"),
     llvm::cl::init(true));
 
+// TODO: Formalize flag under LLVMGPU opt levels.
 /// Flag to force using WMMA tensorcore operations.
 llvm::cl::opt<bool>
     clGPUUseWMMA("iree-codegen-llvmgpu-use-wmma",
                  llvm::cl::desc("force use of wmma operations for tensorcore"),
                  llvm::cl::init(false));
 
+// TODO: Formalize flag under LLVMGPU opt levels.
 /// Flag used to toggle using mma.sync vs wmma when targetting tensorcore.
 llvm::cl::opt<bool>
     clGPUUseMMASync("iree-codegen-llvmgpu-use-mma-sync",
                     llvm::cl::desc("force use mma sync instead of wmma ops"),
                     llvm::cl::init(false));
 
+// TODO: Move to a testing only flag.
 llvm::cl::opt<int> clGPUMatmulCThreshold(
     "iree-codegen-llvmgpu-matmul-c-matrix-threshold",
     llvm::cl::desc("matmul c matrix element count threshold to be considered "
@@ -114,6 +115,13 @@ static llvm::cl::opt<bool>
     clLLVMGPUUseIgemm("iree-codegen-llvmgpu-use-igemm",
                       llvm::cl::desc("Enable implicit gemm for convolutions."),
                       llvm::cl::init(true));
+
+// Hidden testing only flag
+llvm::cl::opt<bool> clGPUTestTileAndFuseVectorize(
+    "iree-codegen-llvmgpu-test-tile-and-fuse-vectorize",
+    llvm::cl::desc(
+        "test the tile and fuse pipeline for all supported operations"),
+    llvm::cl::init(false), llvm::cl::Hidden);
 namespace {
 
 using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline;
@@ -2340,7 +2348,7 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
     LDBG("Tile and fuse data tiled multi_mma config");
     return success();
   }
-  if (clGPUTestTileAndFuseMatmul) {
+  if (clGPUUseTileAndFuseMatmul) {
     if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
                                                      computeOp))) {
       LDBG("Tile and fuse matmul config");

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir (+1 -1)

@@ -1,5 +1,5 @@
 // RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
-// RUN:   --iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
+// RUN:   --iree-codegen-llvmgpu-use-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
 // RUN:   --iree-codegen-llvmgpu-use-igemm=false \
 // RUN:   --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx1100.mlir (+2 -1)

@@ -1,4 +1,5 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 --iree-codegen-llvmgpu-use-vector-distribution \
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1100 \
+// RUN:   --iree-codegen-llvmgpu-use-tile-and-fuse-matmul=false --iree-codegen-llvmgpu-use-vector-distribution \
 // RUN:   --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=WMMA
 
 // TODO: This test is still using the legacy LLVMGPU kernel config. This needs

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx942.mlir (+2 -1)

@@ -1,4 +1,5 @@
-// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --iree-codegen-llvmgpu-use-vector-distribution \
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \
+// RUN:   --iree-codegen-llvmgpu-use-tile-and-fuse-matmul=false --iree-codegen-llvmgpu-use-vector-distribution \
 // RUN:   --iree-codegen-llvmgpu-use-unaligned-gemm-vector-distribution --iree-codegen-llvmgpu-use-igemm=false \
 // RUN:   --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir (+2 -2)

@@ -33,14 +33,14 @@ func.func @custom_op(%arg0 : tensor<384x512xf32>, %arg1 : tensor<512x128xf32>,
   return %1 : tensor<384x128xf32>
 }
 // CHECK: #[[CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[64, 64, 0]]>
-// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [256, 1, 1] subgroup_size = 64,
+// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64,
 // CHECK: func @custom_op
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: iree_linalg_ext.custom_op
 // CHECK-SAME: lowering_config = #[[CONFIG]]
 // CHECK: ^bb
 // CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1], reduction = [0, 0, 32], subgroup_m_count = 2 : i64, subgroup_n_count = 2 : i64, workgroup = [64, 64, 0]}>
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
 // CHECK: iree_linalg_ext.yield
 
 // -----
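
Usage note: with the default flipped, the previous kernel-config behavior can be restored for a single run by disabling the flag, exactly as the updated RUN lines above do. A sketch against iree-opt, where input.mlir is a hypothetical input file:

iree-opt --iree-gpu-test-target=gfx942 \
  --iree-codegen-llvmgpu-use-tile-and-fuse-matmul=false \
  --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" input.mlir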
