Commit d999ed1

Match TileAndFuse Matmul Heuristics to VectorDistribute and raise limit of TileLargeTensorsPass
Signed-off-by: Nirvedh Meshram <nirvedh@gmail.com>
1 parent b89335b commit d999ed1

8 files changed, +40 -33 lines changed

compiler/src/iree/compiler/Codegen/Common/Passes.td

+1 -1

@@ -654,7 +654,7 @@ def TileLargeTensorsPass :
   ];
   let options = [
     Option<"maxVectorSize", "max-vector-size", "int64_t",
-           /*default=*/"64",
+           /*default=*/"256",
           "Maximum static size to tile to (i.e. all remaining ops will be smaller)">,
   ];
 }
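
The raised default lines up with the tile_large_tensors.mlir update below, where a 64x512 f32 generic now tiles to 1x256 slices instead of 1x64. A minimal sketch of the size clamp the option controls; the helper below is hypothetical, not the pass's actual code:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Clamp a static dimension so no remaining op exceeds maxVectorSize elements.
int64_t clampToMaxVectorSize(int64_t dimSize, int64_t maxVectorSize) {
  return std::min(dimSize, maxVectorSize);
}

int main() {
  assert(clampToMaxVectorSize(512, /*old default=*/64) == 64);   // 1x64 tiles
  assert(clampToMaxVectorSize(512, /*new default=*/256) == 256); // 1x256 tiles
  return 0;
}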

compiler/src/iree/compiler/Codegen/Common/test/tile_large_tensors.mlir

+15 -15

@@ -3,22 +3,22 @@
 // RUN: FileCheck %s

 #map = affine_map<(d0, d1) -> (d0, d1)>
-func.func @simple_generic(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>, %5: tensor<64x256xf32>) -> tensor<64x256xf32> {
+func.func @simple_generic(%3: tensor<64x512xf32>, %4: tensor<64x512xf32>, %5: tensor<64x512xf32>) -> tensor<64x512xf32> {
   %6 = linalg.generic {
     indexing_maps = [#map, #map, #map],
     iterator_types = ["parallel", "parallel"]
-  } ins(%3, %4 : tensor<64x256xf32>, tensor<64x256xf32>) outs(%5 : tensor<64x256xf32>) {
+  } ins(%3, %4 : tensor<64x512xf32>, tensor<64x512xf32>) outs(%5 : tensor<64x512xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %7 = arith.addf %in, %in_0 : f32
     linalg.yield %7 : f32
-  } -> tensor<64x256xf32>
-  return %6 : tensor<64x256xf32>
+  } -> tensor<64x512xf32>
+  return %6 : tensor<64x512xf32>
 }

 // CHECK-LABEL: func.func @simple_generic
 // CHECK: scf.for %{{.*}} = %c0 to %c64 step %c1
-// CHECK: scf.for %{{.*}} = %c0 to %c256 step %c64
-// CHECK: linalg.generic {{.*}} outs({{.*}}: tensor<1x64xf32>)
+// CHECK: scf.for %{{.*}} = %c0 to %c512 step %c256
+// CHECK: linalg.generic {{.*}} outs({{.*}}: tensor<1x256xf32>)

 // -----

@@ -65,21 +65,21 @@ func.func @in_nested_region(%3: tensor<64x64xf32>, %4: tensor<64x64xf32>, %5: te

 // -----

-func.func @multiple_use_tilable_op(%3: tensor<64x256xf32>, %4: tensor<64x256xf32>) -> (tensor<64x256xf32>, tensor<256x64xf32>) {
-  %add_empty = tensor.empty() : tensor<64x256xf32>
+func.func @multiple_use_tilable_op(%3: tensor<64x512xf32>, %4: tensor<64x512xf32>) -> (tensor<64x512xf32>, tensor<512x64xf32>) {
+  %add_empty = tensor.empty() : tensor<64x512xf32>
   %6 = linalg.add
-    ins(%3, %4 : tensor<64x256xf32>, tensor<64x256xf32>)
-    outs(%add_empty : tensor<64x256xf32>) -> tensor<64x256xf32>
-  %transpose_empty = tensor.empty() : tensor<256x64xf32>
+    ins(%3, %4 : tensor<64x512xf32>, tensor<64x512xf32>)
+    outs(%add_empty : tensor<64x512xf32>) -> tensor<64x512xf32>
+  %transpose_empty = tensor.empty() : tensor<512x64xf32>
   %7 = linalg.transpose
-    ins(%6 : tensor<64x256xf32>)
-    outs(%transpose_empty : tensor<256x64xf32>) permutation = [1, 0]
-  return %6, %7 : tensor<64x256xf32>, tensor<256x64xf32>
+    ins(%6 : tensor<64x512xf32>)
+    outs(%transpose_empty : tensor<512x64xf32>) permutation = [1, 0]
+  return %6, %7 : tensor<64x512xf32>, tensor<512x64xf32>
 }

 // CHECK-LABEL: func.func @multiple_use_tilable_op
 // CHECK: %[[ADD_TILING:.+]] = scf.for
-// CHECK: linalg.add {{.*}} -> tensor<1x64xf32>
+// CHECK: linalg.add {{.*}} -> tensor<1x256xf32>
 // CHECK: %[[T_TILING:.+]] = scf.for
 // CHECK: %[[FUSED_ADD:.+]] = linalg.add {{.*}} -> tensor<64x1xf32>
 // CHECK: linalg.transpose ins(%[[FUSED_ADD]]

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

+9 -4

@@ -149,16 +149,21 @@ static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
     seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
              /*bestMNTileCountPerSubgroup=*/4,
              /*bestKTileCountPerSubgroup=*/8,
-             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth};
+             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits * 2 /
+                 inBitWidth};
   } else {
     seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
              /*bestMNTileCountPerSubgroup=*/16,
              /*bestKTileCountPerSubgroup=*/4,
-             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / 2 /
-                 inBitWidth};
+             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth};
   }

-  int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes();
+  // We target slightly below the full available shared memory to leave room
+  // for `GPUReduceBankConflictsPass`, which pads shared memory without
+  // keeping track of usage. We can drop this after solving
+  // https://github.com/iree-org/iree/issues/19675
+  int64_t maxSharedMemoryBytes =
+      target.getWgp().getMaxWorkgroupMemoryBytes() - 64 * inBitWidth;

   // First try to find a schedule with an exactly matching intrinsic.
   std::optional<GPUMMASchedule> schedule = deduceMMASchedule(
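
A rough sanity check tying the doubled K seed to the test updates below. It assumes kCacheLineSizeBits is 1024 (a 128-byte cache line); the real constant is defined elsewhere in IREE, so treat these numbers as an assumption:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t kCacheLineSizeBits = 1024; // assumption: 128-byte cache line

  // f16 matmuls with MFMA_F32_16x16x16_F16: inBitWidth = 16, intrinsic K = 16.
  // New seed: 1024 * 2 / 16 = 128 K elements = 8 intrinsic K tiles, matching
  // reduction tiles going 4 -> 8 in config_tile_and_fuse.mlir below.
  assert(kCacheLineSizeBits * 2 / 16 / 16 == 8);

  // f32 convs with MFMA_F32_16x16x4_F32: inBitWidth = 32, intrinsic K = 4.
  // New seed: 1024 * 2 / 32 = 64 K elements = 16 intrinsic K tiles, matching
  // reduction tiles going 8 -> 16 (and K padding 32 -> 64, i.e. 16 tiles
  // times intrinsic K of 4) in config_igemm_tile_and_fuse.mlir below.
  assert(kCacheLineSizeBits * 2 / 32 / 4 == 16);

  // Shared-memory headroom: for f16 (inBitWidth = 16) the new cap reserves
  // 64 * 16 = 1024 bytes below getMaxWorkgroupMemoryBytes().
  assert(64 * 16 == 1024);
  return 0;
}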

compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp

+3

@@ -620,6 +620,9 @@ setMatmulVectorDistributionConfig(IREE::GPU::TargetAttr target,
                                   /*canUpcastAcc=*/true);
   }

+  LDBG("transposedLhs: " << transposedLhs);
+  LDBG("transposedRhs: " << transposedRhs);
+
   // Only batch_matmul is supported in the LLVMGPUPadAndVectorDistribute
   // pipeline.
   // TODO(hanchung): Support cases that there are fused producers.

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_igemm_tile_and_fuse.mlir

+6 -6

@@ -24,7 +24,7 @@ func.func @nhwc_conv_mfma() {
 // CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
 // CHECK-SAME: promote_operands = [0, 1]
-// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
+// CHECK-SAME: reduction = [0, 0, 0, 0, 16]
 // CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
 // CHECK-SAME: workgroup = [1, 2, 32, 64, 0]

@@ -53,7 +53,7 @@ func.func @nchw_conv_mfma() {
 // CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
 // CHECK-SAME: promote_operands = [0, 1]
-// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
+// CHECK-SAME: reduction = [0, 0, 0, 0, 16]
 // CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
 // CHECK-SAME: workgroup = [1, 64, 2, 32, 0]

@@ -81,9 +81,9 @@ func.func @nhwc_conv_unaligned_mfma() {

 // CHECK: linalg.conv_2d_nhwc_hwcf {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
-// CHECK-SAME: padding = [2, 1, 32, 64, 32]
+// CHECK-SAME: padding = [2, 1, 32, 64, 64]
 // CHECK-SAME: promote_operands = [0, 1, 2]
-// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
+// CHECK-SAME: reduction = [0, 0, 0, 0, 16]
 // CHECK-SAME: subgroup = [2, 1, 2, 1, 0]
 // CHECK-SAME: workgroup = [2, 1, 32, 64, 0]

@@ -111,8 +111,8 @@ func.func @nchw_conv_unaligned_mfma() {

 // CHECK: linalg.conv_2d_nchw_fchw {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
-// CHECK-SAME: padding = [1, 64, 2, 32, 32]
+// CHECK-SAME: padding = [1, 64, 2, 32, 64]
 // CHECK-SAME: promote_operands = [0, 1, 2]
-// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
+// CHECK-SAME: reduction = [0, 0, 0, 0, 16]
 // CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
 // CHECK-SAME: workgroup = [1, 64, 2, 32, 0]

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

+3 -3

@@ -39,7 +39,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 // CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME: promote_operands = [0, 1]
-// CHECK-SAME: reduction = [0, 0, 0, 0, 4]
+// CHECK-SAME: reduction = [0, 0, 0, 0, 8]
 // CHECK-SAME: subgroup = [1, 1, 4, 1, 0]
 // CHECK-SAME: workgroup = [1, 1, 64, 64, 0]

@@ -74,7 +74,7 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
 // CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME: promote_operands = [0, 1]
-// CHECK-SAME: reduction = [0, 0, 0, 0, 4, 1]
+// CHECK-SAME: reduction = [0, 0, 0, 0, 8, 1]
 // CHECK-SAME: subgroup = [2, 2, 1, 1, 0, 0]
 // CHECK-SAME: workgroup = [2, 2, 32, 32, 0, 0]

@@ -136,7 +136,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
 // CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME: promote_operands = [0, 1]
-// CHECK-SAME: reduction = [0, 0, 2]
+// CHECK-SAME: reduction = [0, 0, 4]
 // CHECK-SAME: subgroup = [4, 4, 0]
 // CHECK-SAME: workgroup = [128, 128, 0]

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir

+2 -3

@@ -1013,9 +1013,8 @@ hal.executable public @main {
 // CHECK: scf.yield %[[REDUCE]]

 // CHECK: scf.for %{{.*}} = %{{.*}} to %c16 step %c1
-// CHECK: scf.for
-// CHECK-COUNT-4: arith.addf {{.*}} : vector<9xf32>
-// CHECK: vector.transfer_write {{.*}} vector<9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type<storage_buffer>>
+// CHECK-COUNT-4: arith.addf {{.*}} : vector<9x9xf32>
+// CHECK: vector.transfer_write {{.*}} vector<9x9xi8>, memref<32x16x9x9xi8, #hal.descriptor_type<storage_buffer>>

 // -----
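
For reference, the dropped inner scf.for is consistent with the raised TileLargeTensorsPass limit: a 9x9 tile holds 9 * 9 = 81 elements, which exceeded the old max-vector-size of 64 but fits within the new 256, so the adds and the write can remain single 9x9 vector ops (a plausible reading of this test change, not stated in the commit).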

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_custom_op.mlir

+1 -1

@@ -40,7 +40,7 @@ func.func @custom_op(%arg0 : tensor<384x512xf32>, %arg1 : tensor<512x128xf32>,
 // CHECK-SAME: lowering_config = #[[CONFIG]]
 // CHECK: ^bb
 // CHECK: linalg.matmul
-// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1], reduction = [0, 0, 8], subgroup = [2, 2, 0], workgroup = [64, 64, 0]}>
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>, promote_operands = [0, 1], reduction = [0, 0, 16], subgroup = [2, 2, 0], workgroup = [64, 64, 0]}>
 // CHECK: iree_linalg_ext.yield

 // -----
