
Commit 81321d9

Remove the operand promotion for LHS and RHS.
Operand promotion for unaligned matmul cases leads to a dynamic trip count, so forall loop fusion by iree-codegen-gpu-fuse-and-hoist-parallel-loops does not take place.
1 parent 54b9e29 commit 81321d9
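
For illustration, a minimal before/after sketch of the lowering_config attribute this change touches (the tile sizes are copied from the dot_dispatch_1 test expectation below; the #before/#after aliases are illustrative only, not part of the change):

// Before: operands 0 and 1 (LHS and RHS) are marked for shared-memory promotion.
#before = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 4], thread = [2, 1, 0], workgroup = [4, 2, 1]}>
// After: the same tiling configuration without the promotion entry.
#after = #iree_gpu.lowering_config<{reduction = [0, 0, 4], thread = [2, 1, 0], workgroup = [4, 2, 1]}>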

File tree

5 files changed, +8 -10 lines


compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp

-2

@@ -1346,8 +1346,6 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
     attrs.emplace_back(b.getStringAttr("reduction"),
                        b.getI64ArrayAttr(reductionTileSizes));

-    // Promote operands to use shared memory for LHS and RHS.
-    IREE::GPU::setPromotedOperandList(context, attrs, {0, 1});
     auto configDict = b.getDictionaryAttr(attrs);
     auto loweringConfig =
         IREE::GPU::LoweringConfigAttr::get(context, configDict);

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir

+1 -1

@@ -271,7 +271,7 @@ func.func @not_vmt() {
 // CHECK: func.func @not_vmt()
 // CHECK-SAME: translation_info = #[[$TRANSLATION]]
 // CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 8], thread = [1, 128, 0], workgroup = [1, 128, 1]}>
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 8], thread = [1, 128, 0], workgroup = [1, 128, 1]}>

 // -----

compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_set_num_workgroups.mlir

+3 -3

@@ -59,7 +59,7 @@ func.func @dot_dispatch_1() {
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.fill
 // CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 4], thread = [2, 1, 0], workgroup = [4, 2, 1]}>
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 4], thread = [2, 1, 0], workgroup = [4, 2, 1]}>

 // -----

@@ -86,7 +86,7 @@ func.func @unaligned_k() {
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.fill
 // CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 2], thread = [1, 16, 0], workgroup = [32, 128, 1]}>
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 2], thread = [1, 16, 0], workgroup = [32, 128, 1]}>

 // -----

@@ -656,7 +656,7 @@ func.func @_main_dispatch_15_generic_512x4x42x42x64_f32() {
 // CHECK-SAME: translation_info = #[[TRANSLATION]]
 // CHECK: linalg.fill
 // CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 0, 0, 32], thread = [1, 1, 1, 16, 0], workgroup = [1, 1, 32, 128, 1]}>
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 0, 32], thread = [1, 1, 1, 16, 0], workgroup = [1, 1, 32, 128, 1]}>

 // -----

compiler/src/iree/compiler/Codegen/LLVMGPU/test/nvvm_pipeline_test.mlir

+2 -2

@@ -87,8 +87,8 @@ hal.executable @dot_dispatch_0 {
 // CHECK: hal.executable.variant public @cuda
 // CHECK-NOT: llvm.store
 // CHECK: llvm.br
-// CHECK: llvm.load {{.*}} : !llvm.ptr<3> -> vector<32xf32>
-// CHECK-COUNT-32: llvm.load {{.*}} : !llvm.ptr<3> -> vector<16xf32>
+// CHECK: llvm.load {{.*}} : !llvm.ptr<1> -> vector<32xf32>
+// CHECK-COUNT-32: llvm.load {{.*}} : !llvm.ptr<1> -> vector<16xf32>
 // CHECK-COUNT-32: llvm.intr.fmuladd({{.*}}) : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
 // CHECK: llvm.store {{.*}} : vector<16xf32>, !llvm.ptr<1>

compiler/src/iree/compiler/Codegen/LLVMGPU/test/rocdl_pipeline_test.mlir

+2 -2

@@ -88,8 +88,8 @@ hal.executable @dot_dispatch_0 {
 // RDNA3: hal.executable.variant public @rocm
 // RDNA3-NOT: llvm.store
 // RDNA3: llvm.br
-// RDNA3-COUNT-1: llvm.load {{.*}} : !llvm.ptr<3> -> vector<32xf32>
-// RDNA3-COUNT-32: llvm.load {{.*}} : !llvm.ptr<3> -> vector<16xf32>
+// RDNA3-COUNT-1: llvm.load {{.*}} : !llvm.ptr<1> -> vector<32xf32>
+// RDNA3-COUNT-32: llvm.load {{.*}} : !llvm.ptr<1> -> vector<16xf32>
 // RDNA3-COUNT-32: llvm.intr.fmuladd({{.*}}) : (vector<16xf32>, vector<16xf32>, vector<16xf32>) -> vector<16xf32>
 // RDNA3-COUNT-1: llvm.store {{.*}} : vector<16xf32>, !llvm.ptr<1>
 // RDNA3: llvm.br
