
Commit 10b5135

nirvedhmeshram and ita9naiwa authored and committed
[GPU] Use tile and fuse for matmul after vector distribute by default (iree-org#19884)
Currently some efforts such as iree-org#19854 and iree-org#19520 are ongoing to make the Tile and Fuse matmul pipeline on by default. However, these efforts are still WIP in achieving exact parity with the current default of Vector Distribute in all use cases. This PR in the time being tries Tile and Fuse after Vector Distribute so that we can get the benefits of tile and fuse such as handling unaligned to intrinsic shape while leaving the shapes that vector distribute handles untouched. Fixes : iree-org#19864 Fixes: iree-org#19855 --------- Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com> Signed-off-by: Hyunsung Lee <ita9naiwa@gmail.com>
1 parent 667dbf9 · commit 10b5135

2 files changed (+55, −32)

compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp (+10, −3)
@@ -50,8 +50,8 @@
 #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
 namespace mlir::iree_compiler {
 
-llvm::cl::opt<bool> clGPUTestTileAndFuseMatmul(
-    "iree-codegen-llvmgpu-test-tile-and-fuse-matmul",
+llvm::cl::opt<bool> clGPUEarlyTileAndFuseMatmul(
+    "iree-codegen-llvmgpu-early-tile-and-fuse-matmul",
     llvm::cl::desc("test the the tile and fuse pipeline for matmul"),
     llvm::cl::init(false));
 
@@ -2340,7 +2340,7 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
     LDBG("Tile and fuse data tiled multi_mma config");
     return success();
   }
-  if (clGPUTestTileAndFuseMatmul) {
+  if (clGPUEarlyTileAndFuseMatmul) {
     if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
                                                      computeOp))) {
       LDBG("Tile and fuse matmul config");
@@ -2364,6 +2364,13 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
   if (succeeded(setVectorDistributionConfig(target, entryPointFn, computeOp))) {
     return success();
   }
+  // TODO (nirvedhmeshram, qedawkins) : remove this when tile and fuse backend
+  // config becomes the default for matmul.
+  if (succeeded(IREE::GPU::setMatmulLoweringConfig(target, entryPointFn,
+                                                   computeOp))) {
+    LDBG("Tile and fuse matmul config after no vector distribute config");
+    return success();
+  }
 
   if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
     if (succeeded(setContractConfig(target, entryPointFn, linalgOp))) {

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir (+45, −29)
@@ -1,7 +1,11 @@
 // RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
-// RUN: --iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
+// RUN: --iree-codegen-llvmgpu-early-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
 // RUN: --iree-codegen-llvmgpu-use-igemm=false \
-// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
+// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=CHECK
+//
+// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
+// RUN: --iree-codegen-llvmgpu-use-igemm=false \
+// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=LATE
 
 // TODO: This test is still using the legacy LLVMGPU kernel config. This needs
 // to be migrated to the rocdl heuristics, but for now is just physically
@@ -43,6 +47,8 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 // CHECK-SAME: subgroup = [1, 1, 4, 1, 0]
 // CHECK-SAME: workgroup = [1, 1, 64, 64, 0]
 
+// LATE: LLVMGPUVectorDistribute
+
 // -----
 
 #map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d5)>
@@ -78,6 +84,8 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
 // CHECK-SAME: subgroup = [2, 2, 1, 1, 0, 0]
 // CHECK-SAME: workgroup = [2, 2, 32, 32, 0, 0]
 
+// LATE: LLVMGPUVectorDistribute
+
 // -----
 
 #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d5, d6)>
@@ -115,6 +123,8 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: t
 // CHECK-SAME: subgroup = [0, 1, 0, 1, 1, 0, 0]
 // CHECK-SAME: workgroup = [1, 2, 1, 16, 32, 0, 0]
 
+// LATE: LLVMGPUVectorDistribute
+
 // -----
 
 func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<1024x1024xf16>) -> tensor<1024x1024xf32> {
@@ -140,6 +150,8 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
 // CHECK-SAME: subgroup = [4, 4, 0]
 // CHECK-SAME: workgroup = [128, 128, 0]
 
+// LATE: LLVMGPUVectorDistribute
+
 // -----
 
 module {
@@ -160,6 +172,8 @@ module {
 // CHECK-SAME: thread = [1, 1, 1, 1, 0, 0, 0]
 // CHECK-SAME: workgroup = [1, 1, 1, 64, 0, 0, 0]
 
+// LATE: LLVMGPUVectorDistribute
+
 // -----
 
 module {
@@ -182,6 +196,8 @@ module {
 // CHECK-SAME: thread = [1, 4, 0]
 // CHECK-SAME: workgroup = [1, 256, 0]
 
+// LATE: LLVMGPUWarpReduction
+
 // -----
 
 module {
@@ -275,15 +291,15 @@ func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>,
 }
 }
 
-// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul
-// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
-// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
-// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: padding = [1, 16, 16, 4]
-// CHECK-SAME: promote_operands = [0, 1, 2]
-// CHECK-SAME: reduction = [0, 0, 0, 1]
-// CHECK-SAME: subgroup = [0, 1, 1, 0]
-// CHECK-SAME: workgroup = [1, 16, 16, 0]
+// LATE-LABEL: func.func @unaligned_to_intrinsic_batched_matmul
+// LATE-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
+// LATE-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
+// LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
+// LATE-SAME: padding = [1, 16, 16, 4]
+// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: reduction = [0, 0, 0, 1]
+// LATE-SAME: subgroup = [0, 1, 1, 0]
+// LATE-SAME: workgroup = [1, 16, 16, 0]
 
 // -----
 
@@ -302,15 +318,15 @@ func.func @unaligned_matmul_with_two_reduce_dim(%arg0: tensor<196x9x4xf32>, %arg
 }
 }
 
-// CHECK-LABEL: func.func @unaligned_matmul_with_two_reduce_dim
-// CHECK-SAME: {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
-// CHECK: linalg.generic
-// CHECK-SAME: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
-// CHECK-SAME: padding = [16, 1, 16, 4]
-// CHECK-SAME: promote_operands = [0, 1, 2]
-// CHECK-SAME: reduction = [0, 1, 0, 1],
-// CHECK-SAME: subgroup = [1, 0, 1, 0],
-// CHECK-SAME: workgroup = [16, 0, 16, 0]}
+// LATE-LABEL: func.func @unaligned_matmul_with_two_reduce_dim
+// LATE-SAME: {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
+// LATE: linalg.generic
+// LATE-SAME: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
+// LATE-SAME: padding = [16, 1, 16, 4]
+// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: reduction = [0, 1, 0, 1],
+// LATE-SAME: subgroup = [1, 0, 1, 0],
+// LATE-SAME: workgroup = [16, 0, 16, 0]}
 
 // -----
 
@@ -331,15 +347,15 @@ func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(%lhs : tensor<12x5
 // In this unit test, if C promotion is not considered, it will deduce a MMA
 // schedule with nTileSize of 16 while in reality it should be 8.
 
-// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
-// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
-// CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
-// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
-// CHECK-SAME: padding = [1, 16, 512, 4]
-// CHECK-SAME: promote_operands = [0, 1, 2]
-// CHECK-SAME: reduction = [0, 0, 0, 1]
-// CHECK-SAME: subgroup = [0, 1, 8, 0]
-// CHECK-SAME: workgroup = [1, 16, 512, 0]
+// LATE-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
+// LATE-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
+// LATE-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
+// LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
+// LATE-SAME: padding = [1, 16, 512, 4]
+// LATE-SAME: promote_operands = [0, 1, 2]
+// LATE-SAME: reduction = [0, 0, 0, 1]
+// LATE-SAME: subgroup = [0, 1, 8, 0]
+// LATE-SAME: workgroup = [1, 16, 512, 0]
 
 // -----
