// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
- // RUN: --iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
+ // RUN: --iree-codegen-llvmgpu-early-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
// RUN: --iree-codegen-llvmgpu-use-igemm=false \
- // RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s
+ // RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=CHECK
+ //
+ // RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
+ // RUN: --iree-codegen-llvmgpu-use-igemm=false \
+ // RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s --check-prefix=LATE
// TODO: This test is still using the legacy LLVMGPU kernel config. This needs
// to be migrated to the rocdl heuristics, but for now is just physically
@@ -43,6 +47,8 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
// CHECK-SAME: subgroup = [1, 1, 4, 1, 0]
// CHECK-SAME: workgroup = [1, 1, 64, 64, 0]
+ // LATE: LLVMGPUVectorDistribute
+
// -----
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d5)>
@@ -78,6 +84,8 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
// CHECK-SAME: subgroup = [2, 2, 1, 1, 0, 0]
// CHECK-SAME: workgroup = [2, 2, 32, 32, 0, 0]
+ // LATE: LLVMGPUVectorDistribute
+
// -----
#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d5, d6)>
@@ -115,6 +123,8 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: t
// CHECK-SAME: subgroup = [0, 1, 0, 1, 1, 0, 0]
// CHECK-SAME: workgroup = [1, 2, 1, 16, 32, 0, 0]
+ // LATE: LLVMGPUVectorDistribute
+
// -----
func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<1024x1024xf16>) -> tensor<1024x1024xf32> {
@@ -140,6 +150,8 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
// CHECK-SAME: subgroup = [4, 4, 0]
// CHECK-SAME: workgroup = [128, 128, 0]
+ // LATE: LLVMGPUVectorDistribute
+
// -----
module {
@@ -160,6 +172,8 @@ module {
// CHECK-SAME: thread = [1, 1, 1, 1, 0, 0, 0]
// CHECK-SAME: workgroup = [1, 1, 1, 64, 0, 0, 0]
+ // LATE: LLVMGPUVectorDistribute
+
// -----
module {
@@ -182,6 +196,8 @@ module {
// CHECK-SAME: thread = [1, 4, 0]
// CHECK-SAME: workgroup = [1, 256, 0]
+ // LATE: LLVMGPUWarpReduction
+
// -----
module {
@@ -275,15 +291,15 @@ func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>,
}
}
- // CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul
- // CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
- // CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
- // CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
- // CHECK-SAME: padding = [1, 16, 16, 4]
- // CHECK-SAME: promote_operands = [0, 1, 2]
- // CHECK-SAME: reduction = [0, 0, 0, 1]
- // CHECK-SAME: subgroup = [0, 1, 1, 0]
- // CHECK-SAME: workgroup = [1, 16, 16, 0]
+ // LATE-LABEL: func.func @unaligned_to_intrinsic_batched_matmul
+ // LATE-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
+ // LATE-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
+ // LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
+ // LATE-SAME: padding = [1, 16, 16, 4]
+ // LATE-SAME: promote_operands = [0, 1, 2]
+ // LATE-SAME: reduction = [0, 0, 0, 1]
+ // LATE-SAME: subgroup = [0, 1, 1, 0]
+ // LATE-SAME: workgroup = [1, 16, 16, 0]
// -----
@@ -302,15 +318,15 @@ func.func @unaligned_matmul_with_two_reduce_dim(%arg0: tensor<196x9x4xf32>, %arg
}
}
- // CHECK-LABEL: func.func @unaligned_matmul_with_two_reduce_dim
- // CHECK-SAME: {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
- // CHECK: linalg.generic
- // CHECK-SAME: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
- // CHECK-SAME: padding = [16, 1, 16, 4]
- // CHECK-SAME: promote_operands = [0, 1, 2]
- // CHECK-SAME: reduction = [0, 1, 0, 1],
- // CHECK-SAME: subgroup = [1, 0, 1, 0],
- // CHECK-SAME: workgroup = [16, 0, 16, 0]}
+ // LATE-LABEL: func.func @unaligned_matmul_with_two_reduce_dim
+ // LATE-SAME: {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
+ // LATE: linalg.generic
+ // LATE-SAME: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
+ // LATE-SAME: padding = [16, 1, 16, 4]
+ // LATE-SAME: promote_operands = [0, 1, 2]
+ // LATE-SAME: reduction = [0, 1, 0, 1],
+ // LATE-SAME: subgroup = [1, 0, 1, 0],
+ // LATE-SAME: workgroup = [16, 0, 16, 0]}
// -----
@@ -331,15 +347,15 @@ func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(%lhs : tensor<12x5
// In this unit test, if C promotion is not considered, it will deduce a MMA
// schedule with nTileSize of 16 while in reality it should be 8.
- // CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
- // CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
- // CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
- // CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
- // CHECK-SAME: padding = [1, 16, 512, 4]
- // CHECK-SAME: promote_operands = [0, 1, 2]
- // CHECK-SAME: reduction = [0, 0, 0, 1]
- // CHECK-SAME: subgroup = [0, 1, 8, 0]
- // CHECK-SAME: workgroup = [1, 16, 512, 0]
+ // LATE-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
+ // LATE-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
+ // LATE-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
+ // LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
+ // LATE-SAME: padding = [1, 16, 512, 4]
+ // LATE-SAME: promote_operands = [0, 1, 2]
+ // LATE-SAME: reduction = [0, 0, 0, 1]
+ // LATE-SAME: subgroup = [0, 1, 8, 0]
+ // LATE-SAME: workgroup = [1, 16, 512, 0]
// -----
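The C-promotion comment in `@unaligned_to_intrinsic_batched_matmul_tiling_check` above reads as a shared-memory budget argument, and a quick back-of-the-envelope model makes it concrete. The sketch below is not IREE's actual deduction logic; it assumes a 64 KiB LDS budget on gfx942, f32 tiles shaped by `padding = [1, 16, 512, 4]`, the four subgroups of the `[256, 1, 1]` workgroup laid out along N, and no double buffering.

```python
# Illustrative LDS-budget model only -- the real heuristic lives in IREE's
# GPU kernel-config logic. The 64 KiB budget, the nTileSize -> N mapping
# (4 subgroups along N x 16-wide MFMA intrinsic), and the
# no-double-buffering simplification are all assumptions of this sketch.
BYTES_F32 = 4
LDS_BUDGET = 64 * 1024  # assumed shared memory per workgroup on gfx942

def lds_bytes(m: int, n: int, k: int, promote_c: bool) -> int:
    a = m * k * BYTES_F32                      # promoted LHS tile
    b = k * n * BYTES_F32                      # promoted RHS tile
    c = m * n * BYTES_F32 if promote_c else 0  # promoted accumulator tile
    return a + b + c

for n_tile_size in (8, 16):
    n = n_tile_size * 16 * 4  # intrinsic N (16) x subgroups along N (4)
    used = lds_bytes(m=16, n=n, k=4, promote_c=True)
    verdict = "fits" if used <= LDS_BUDGET else "exceeds budget"
    print(f"nTileSize={n_tile_size:2} -> N tile {n:4}: "
          f"{used / 1024:5.1f} KiB ({verdict})")
```

Under these assumptions, counting the promoted C tile makes nTileSize = 16 need roughly 80 KiB (the accumulator alone is 64 KiB), while ignoring it makes both sizes look affordable; that mis-deduction is what this test guards against.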