// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx942 \
- // RUN: --iree-codegen-llvmgpu-test-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
+ // RUN: --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
// RUN: --iree-codegen-llvmgpu-use-igemm=false \
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s | FileCheck %s

@@ -10,21 +10,23 @@
#map = affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4)>
#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>
#map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>
- func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor<10x64x2048xf16>) -> tensor<2x10x64x64xf16> {
+ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor<10x64x2048xf16>) -> tensor<2x10x64x64xf32> {
  %c0 = arith.constant 0 : index
-   %cst = arith.constant 0.000000e+00 : f16
-   %5 = tensor.empty() : tensor<2x10x64x64xf16>
-   %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<2x10x64x64xf16>) -> tensor<2x10x64x64xf16>
+   %cst = arith.constant 0.000000e+00 : f32
+   %5 = tensor.empty() : tensor<2x10x64x64xf32>
+   %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x10x64x64xf32>) -> tensor<2x10x64x64xf32>
  %7 = linalg.generic {
    indexing_maps = [#map, #map1, #map2],
    iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction"]}
-     ins(%lhs, %rhs : tensor<2x64x2048xf16>, tensor<10x64x2048xf16>) outs(%6 : tensor<2x10x64x64xf16>) {
-   ^bb0(%in: f16, %in_0: f16, %out: f16):
-     %8 = arith.mulf %in, %in_0 : f16
-     %9 = arith.addf %8, %out : f16
-     linalg.yield %9 : f16
-   } -> tensor<2x10x64x64xf16>
-   return %7 : tensor<2x10x64x64xf16>
+     ins(%lhs, %rhs : tensor<2x64x2048xf16>, tensor<10x64x2048xf16>) outs(%6 : tensor<2x10x64x64xf32>) {
+   ^bb0(%in: f16, %in_0: f16, %out: f32):
+     %8 = arith.extf %in : f16 to f32
+     %9 = arith.extf %in_0 : f16 to f32
+     %10 = arith.mulf %8, %9 : f32
+     %11 = arith.addf %10, %out : f32
+     linalg.yield %11 : f32
+   } -> tensor<2x10x64x64xf32>
+   return %7 : tensor<2x10x64x64xf32>
}

// CHECK-LABEL: func.func @expanded_matmul_transpose_b
@@ -46,21 +48,23 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d4, d5)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d4, d5)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>
- func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4x32x128x16xf16>) -> tensor<10x4x32x32xf16> {
+ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4x32x128x16xf16>) -> tensor<10x4x32x32xf32> {
  %c0 = arith.constant 0 : index
-   %cst = arith.constant 0.000000e+00 : f16
-   %5 = tensor.empty() : tensor<10x4x32x32xf16>
-   %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<10x4x32x32xf16>) -> tensor<10x4x32x32xf16>
+   %cst = arith.constant 0.000000e+00 : f32
+   %5 = tensor.empty() : tensor<10x4x32x32xf32>
+   %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<10x4x32x32xf32>) -> tensor<10x4x32x32xf32>
  %7 = linalg.generic {
    indexing_maps = [#map, #map1, #map2],
    iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]}
-     ins(%lhs, %rhs : tensor<10x32x128x16xf16>, tensor<4x32x128x16xf16>) outs(%6 : tensor<10x4x32x32xf16>) {
-   ^bb0(%in: f16, %in_0: f16, %out: f16):
-     %8 = arith.mulf %in, %in_0 : f16
-     %9 = arith.addf %8, %out : f16
-     linalg.yield %9 : f16
-   } -> tensor<10x4x32x32xf16>
-   return %7 : tensor<10x4x32x32xf16>
+     ins(%lhs, %rhs : tensor<10x32x128x16xf16>, tensor<4x32x128x16xf16>) outs(%6 : tensor<10x4x32x32xf32>) {
+   ^bb0(%in: f16, %in_0: f16, %out: f32):
+     %8 = arith.extf %in : f16 to f32
+     %9 = arith.extf %in_0 : f16 to f32
+     %10 = arith.mulf %8, %9 : f32
+     %11 = arith.addf %10, %out : f32
+     linalg.yield %11 : f32
+   } -> tensor<10x4x32x32xf32>
+   return %7 : tensor<10x4x32x32xf32>
}

// CHECK-LABEL: func.func @multi_dim_mma_schedule
@@ -79,23 +83,25 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d3, d5, d6)>
#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d2, d4, d5, d6)>
#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d2, d3, d4)>
- func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: tensor<?x32x?x16xf16>) -> tensor<?x6x?x16x32xf16> {
+ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: tensor<?x32x?x16xf16>) -> tensor<?x6x?x16x32xf32> {
  %c0 = arith.constant 0 : index
-   %cst = arith.constant 0.000000e+00 : f16
+   %cst = arith.constant 0.000000e+00 : f32
  %d0 = tensor.dim %lhs, %c0 : tensor<?x6x16x?x16xf16>
  %d2 = tensor.dim %rhs, %c0 : tensor<?x32x?x16xf16>
-   %5 = tensor.empty(%d0, %d2) : tensor<?x6x?x16x32xf16>
-   %6 = linalg.fill ins(%cst : f16) outs(%5 : tensor<?x6x?x16x32xf16>) -> tensor<?x6x?x16x32xf16>
+   %5 = tensor.empty(%d0, %d2) : tensor<?x6x?x16x32xf32>
+   %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<?x6x?x16x32xf32>) -> tensor<?x6x?x16x32xf32>
  %7 = linalg.generic {
    indexing_maps = [#map, #map1, #map2],
    iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]}
-     ins(%lhs, %rhs : tensor<?x6x16x?x16xf16>, tensor<?x32x?x16xf16>) outs(%6 : tensor<?x6x?x16x32xf16>) {
-   ^bb0(%in: f16, %in_0: f16, %out: f16):
-     %8 = arith.mulf %in, %in_0 : f16
-     %9 = arith.addf %8, %out : f16
-     linalg.yield %9 : f16
-   } -> tensor<?x6x?x16x32xf16>
-   return %7 : tensor<?x6x?x16x32xf16>
+     ins(%lhs, %rhs : tensor<?x6x16x?x16xf16>, tensor<?x32x?x16xf16>) outs(%6 : tensor<?x6x?x16x32xf32>) {
+   ^bb0(%in: f16, %in_0: f16, %out: f32):
+     %8 = arith.extf %in : f16 to f32
+     %9 = arith.extf %in_0 : f16 to f32
+     %10 = arith.mulf %8, %9 : f32
+     %11 = arith.addf %10, %out : f32
+     linalg.yield %11 : f32
+   } -> tensor<?x6x?x16x32xf32>
+   return %7 : tensor<?x6x?x16x32xf32>
}

// CHECK-LABEL: func.func @dynamic_multi_dim_mma_schedule
@@ -271,7 +277,7 @@ func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x577x577xf32>,

// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
- // CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
+ // CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: padding = [1, 16, 16, 4]
// CHECK-SAME: promote_operands = [0, 1, 2]
@@ -300,7 +306,7 @@ func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(%lhs : tensor<12x5

// CHECK-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
- // CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
+ // CHECK-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
// CHECK: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: padding = [1, 16, 512, 4]
// CHECK-SAME: promote_operands = [0, 1, 2]