
Commit 16097c1

Remove the operand promotion for LHS and RHS. (#19516)
Operand promotion for unaligned matmul cases leads to a dynamic trip count, so forall loop fusion is not performed by iree-codegen-gpu-fuse-and-hoist-parallel-loops.
1 parent e553425 commit 16097c1

17 files changed: +106 -259 lines

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir

+1 -1

@@ -25,7 +25,7 @@
 ]>
 hal.executable private @main_dispatch_0 {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export public @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 ordinal(0) layout(#pipeline_layout) attributes {subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>, workgroup_size = [64 : index, 16 : index, 1 : index]} {
+    hal.executable.export public @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 ordinal(0) layout(#pipeline_layout) attributes {subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>, workgroup_size = [64 : index, 16 : index, 1 : index]} {
     ^bb0(%arg0: !hal.device):
       %c250 = arith.constant 250 : index
       %c500 = arith.constant 500 : index

compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td

+10 -12

@@ -40,26 +40,24 @@ def LLVMGPU_SimpleDistribute
     : I32EnumAttrCase<"LLVMGPUDistribute", 102>;
 def LLVMGPU_Vectorize
     : I32EnumAttrCase<"LLVMGPUVectorize", 103>;
-def LLVMGPU_MatmulSimt
-    : I32EnumAttrCase<"LLVMGPUMatmulSimt", 104>;
 def LLVMGPU_MatmulTensorCore
-    : I32EnumAttrCase<"LLVMGPUMatmulTensorCore", 105>;
+    : I32EnumAttrCase<"LLVMGPUMatmulTensorCore", 104>;
 def LLVMGPU_TransposeSharedMem
-    : I32EnumAttrCase<"LLVMGPUTransposeSharedMem", 106>;
+    : I32EnumAttrCase<"LLVMGPUTransposeSharedMem", 105>;
 def LLVMGPU_WarpReduction
-    : I32EnumAttrCase<"LLVMGPUWarpReduction", 107>;
+    : I32EnumAttrCase<"LLVMGPUWarpReduction", 106>;
 def LLVMGPU_PackUnPack
-    : I32EnumAttrCase<"LLVMGPUPackUnPack", 108>;
+    : I32EnumAttrCase<"LLVMGPUPackUnPack", 107>;
 def LLVMGPU_MatmulTensorCoreMmaSync
-    : I32EnumAttrCase<"LLVMGPUMatmulTensorCoreMmaSync", 109>;
+    : I32EnumAttrCase<"LLVMGPUMatmulTensorCoreMmaSync", 108>;
 def LLVMGPU_VectorDistribute
-    : I32EnumAttrCase<"LLVMGPUVectorDistribute", 110>;
+    : I32EnumAttrCase<"LLVMGPUVectorDistribute", 109>;
 def LLVMGPU_PadAndVectorDistribute
-    : I32EnumAttrCase<"LLVMGPUPadAndVectorDistribute", 111>;
+    : I32EnumAttrCase<"LLVMGPUPadAndVectorDistribute", 110>;
 def LLVMGPU_WinogradVectorize
-    : I32EnumAttrCase<"LLVMGPUWinogradVectorize", 112>;
+    : I32EnumAttrCase<"LLVMGPUWinogradVectorize", 111>;
 def LLVMGPU_TileAndFuse
-    : I32EnumAttrCase<"LLVMGPUTileAndFuse", 113>;
+    : I32EnumAttrCase<"LLVMGPUTileAndFuse", 112>;

 def SPIRV_BaseLowering
     : I32EnumAttrCase<"SPIRVBaseLowering", 200>;
@@ -98,7 +96,7 @@ def DispatchLoweringPassPipelineEnum : I32EnumAttr<

       // LLVMGPU CodeGen pipelines
       LLVMGPU_Default, LLVMGPU_BaseLowering, LLVMGPU_SimpleDistribute,
-      LLVMGPU_Vectorize, LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore,
+      LLVMGPU_Vectorize, LLVMGPU_MatmulTensorCore,
       LLVMGPU_TransposeSharedMem, LLVMGPU_WarpReduction, LLVMGPU_PackUnPack,
       LLVMGPU_MatmulTensorCoreMmaSync, LLVMGPU_VectorDistribute,
       LLVMGPU_PadAndVectorDistribute, LLVMGPU_WinogradVectorize,

compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp

+61 -7

@@ -1295,9 +1295,11 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
                                        CodeGenPipeline pipeline) {
   TileSizesListType tileSizes;
   unsigned numParallelLoops = op.getNumParallelLoops();
-  SmallVector<int64_t> workgroupTileSizes(numParallelLoops - 2, 1);
-  workgroupTileSizes.append({tileX, tileY});
-  workgroupTileSizes.append(op.getNumReductionLoops(), tileK);
+  unsigned numReductionLoops = op.getNumReductionLoops();
+  SmallVector<int64_t> workgroupTileSizes(
+      numParallelLoops + numReductionLoops, 1);
+  workgroupTileSizes[numParallelLoops - 2] = tileX;
+  workgroupTileSizes[numParallelLoops - 1] = tileY;

   SmallVector<unsigned> partitionedLoops =
       cast<PartitionableLoopsInterface>(op.getOperation())
@@ -1311,11 +1313,63 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
     }
   }

-  tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level.
   std::optional<int64_t> subgroupSize = std::nullopt;
   if (!subgroupSizes.empty())
     subgroupSize = subgroupSizes.front();

+  // For the LLVMGPUTileAndFuse pipeline, we need to split tile sizes
+  // for workgroup, thread, and reduction.
+  if (pipeline == CodeGenPipeline::LLVMGPUTileAndFuse) {
+
+    auto context = op.getContext();
+    Builder b(context);
+    SmallVector<NamedAttribute, 1> attrs;
+
+    SmallVector<int64_t> threadTileSizes(numParallelLoops + numReductionLoops,
+                                         0);
+    std::fill(threadTileSizes.begin(),
+              threadTileSizes.begin() + numParallelLoops, 1);
+
+    threadTileSizes[numParallelLoops - 2] =
+        (tileX / workgroupSize[0]) < 1 ? 1 : (tileX / workgroupSize[0]);
+    threadTileSizes[numParallelLoops - 1] =
+        (tileY / workgroupSize[1]) < 1 ? 1 : (tileY / workgroupSize[1]);
+
+    SmallVector<int64_t> reductionTileSizes(
+        numParallelLoops + numReductionLoops, 0);
+    reductionTileSizes[numParallelLoops + numReductionLoops - 1] = tileK;
+
+    attrs.emplace_back(b.getStringAttr("workgroup"),
+                       b.getI64ArrayAttr(workgroupTileSizes));
+    attrs.emplace_back(b.getStringAttr("thread"),
+                       b.getI64ArrayAttr(threadTileSizes));
+    attrs.emplace_back(b.getStringAttr("reduction"),
+                       b.getI64ArrayAttr(reductionTileSizes));
+
+    auto configDict = b.getDictionaryAttr(attrs);
+    auto loweringConfig =
+        IREE::GPU::LoweringConfigAttr::get(context, configDict);
+    SmallVector<NamedAttribute, 1> pipelineAttrs;
+    auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
+        context, /*prefetchSharedMemory=*/false,
+        /*no_reduce_shared_memory_bank_conflicts=*/true,
+        /*use_igemm_convolution=*/false,
+        /*reorder_workgroups_strategy=*/std::nullopt);
+    pipelineAttrs.emplace_back(
+        b.getStringAttr(IREE::GPU::GPUPipelineOptionsAttr::getDictKeyName()),
+        pipelineOptions);
+    auto pipelineConfig = b.getDictionaryAttr(pipelineAttrs);
+
+    return setOpConfigAndEntryPointFnTranslation(
+        entryPoint, op, loweringConfig, pipeline, workgroupSize, subgroupSize,
+        pipelineConfig);
+  }
+
+  // Other pipeline (MatmulTensorCore) expect the reduction tile size to be in
+  // the same list.
+  workgroupTileSizes[numParallelLoops + numReductionLoops - 1] = tileK;
+  tileSizes.emplace_back(std::move(workgroupTileSizes));
+
   return setOpConfigAndEntryPointFnTranslation(
       entryPoint, op, tileSizes, pipeline, workgroupSize, subgroupSize,
       getSoftwarePipeliningAttrDict(op->getContext(), softwarePipelineDepth,
@@ -1390,7 +1444,7 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
       return setMatmulConfig(
           sizeN, sizeM, 4, {sizeM, sizeN, 1},
           target.getWgp().getSubgroupSizeChoices().asArrayRef(),
-          softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUMatmulSimt);
+          softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUTileAndFuse);
     }

     // SIMT matmul case. Query the best configuration.
@@ -1404,7 +1458,7 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
           config.tileSize[0], config.tileSize[1], config.tileSize[2],
           config.workgroupSize,
           target.getWgp().getSubgroupSizeChoices().asArrayRef(),
-          softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUMatmulSimt);
+          softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUTileAndFuse);
     }
   }
 }
@@ -1429,7 +1483,7 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
   return setMatmulConfig(tileX, tileY, tileK, workgroupSize,
                          target.getWgp().getSubgroupSizeChoices().asArrayRef(),
                          softwarePipelineDepthSimt,
-                         CodeGenPipeline::LLVMGPUMatmulSimt);
+                         CodeGenPipeline::LLVMGPUTileAndFuse);
 }

 //====---------------------------------------------------------------------===//
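
The new LLVMGPUTileAndFuse branch above splits one flat tile-size list into separate workgroup, thread, and reduction lists. As a rough standalone sketch (not the IREE code itself), the snippet below mirrors that arithmetic for a hypothetical matmul; the loop counts, tile sizes, and workgroup size are assumed purely for illustration.

// Minimal standalone sketch of the workgroup/thread/reduction tile-size split.
// All values here are hypothetical and only mirror the logic in the diff above.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>
using std::int64_t;

int main() {
  // Hypothetical matmul: 2 parallel loops (M, N) and 1 reduction loop (K).
  unsigned numParallelLoops = 2, numReductionLoops = 1;
  int64_t tileX = 32, tileY = 128, tileK = 8;       // assumed tile sizes
  std::vector<int64_t> workgroupSize = {32, 1, 1};  // assumed workgroup size

  // Workgroup tiles cover the parallel dims; reduction dims stay at 1.
  std::vector<int64_t> workgroup(numParallelLoops + numReductionLoops, 1);
  workgroup[numParallelLoops - 2] = tileX;
  workgroup[numParallelLoops - 1] = tileY;

  // Thread tiles split the workgroup tile across threads, clamped to >= 1.
  std::vector<int64_t> thread(numParallelLoops + numReductionLoops, 0);
  std::fill(thread.begin(), thread.begin() + numParallelLoops, 1);
  thread[numParallelLoops - 2] = std::max<int64_t>(1, tileX / workgroupSize[0]);
  thread[numParallelLoops - 1] = std::max<int64_t>(1, tileY / workgroupSize[1]);

  // Only the innermost reduction dim is tiled, by tileK.
  std::vector<int64_t> reduction(numParallelLoops + numReductionLoops, 0);
  reduction[numParallelLoops + numReductionLoops - 1] = tileK;

  auto print = [](const char *name, const std::vector<int64_t> &v) {
    std::cout << name << " = [";
    for (size_t i = 0; i < v.size(); ++i)
      std::cout << v[i] << (i + 1 < v.size() ? ", " : "");
    std::cout << "]\n";
  };
  print("workgroup", workgroup);
  print("thread", thread);
  print("reduction", reduction);
  return 0;
}

With these assumed values the sketch prints workgroup = [32, 128, 1], thread = [1, 128, 0], and reduction = [0, 0, 8], i.e. the same workgroup/thread/reduction dictionary shape that the updated config_matvec.mlir test below checks for.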

compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp

-3

@@ -114,9 +114,6 @@ void LLVMGPULowerExecutableTargetPass::runOnOperation() {
   case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUWinogradVectorize:
     addGPUWinogradVectorizePassPipeline(pipeline);
     break;
-  case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulSimt:
-    addGPUMatmulSimtPassPipeline(pipeline, pipelineOptions);
-    break;
   case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulTensorCore: {
     FailureOr<int64_t> maybeDepth =
         getSoftwarePipelineDepth(translationInfo.getConfiguration());

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp

-66

@@ -526,72 +526,6 @@ void addGPUWinogradVectorizePassPipeline(OpPassManager &funcPassManager) {
   funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
 }

-//===---------------------------------------------------------------------===//
-// MatmulSIMT
-//===---------------------------------------------------------------------===//
-
-void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
-                                  const GPUPipelineOptions &options) {
-  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false);
-
-  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
-  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
-  funcPassManager.addPass(createCSEPass());
-
-  funcPassManager.addPass(createGPUTensorTileToSerialLoopsPass());
-  funcPassManager.addPass(createGPUTensorAlloc());
-  funcPassManager.addPass(createGPUTensorTilePass());
-
-  // Linalg -> vector
-  addGPUVectorizationPasses(funcPassManager);
-
-  // tensor to memref
-  addBufferizePasses(funcPassManager);
-
-  // distribute foreach threads
-  funcPassManager.addPass(createGPUDistributePass());
-
-  funcPassManager.addPass(createMemrefCopyToLinalgPass());
-  funcPassManager.addPass(createGPUDistributeSharedMemoryCopyPass());
-  funcPassManager.addPass(createCanonicalizerPass());
-  funcPassManager.addPass(createCSEPass());
-
-  if (options.enableReduceSharedMemoryBankConflicts) {
-    funcPassManager.addPass(createGPUReduceBankConflictsPass());
-  }
-
-  ReorderWorkgroupsStrategy reorderStrategy =
-      getReorderWorkgroupsStrategy(options.reorderStrategy);
-  funcPassManager.addPass(
-      createReorderWorkgroups(reorderStrategy, canReorderWorkgroups));
-
-  funcPassManager.addPass(createCanonicalizerPass());
-  funcPassManager.addPass(createCSEPass());
-
-  funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
-  funcPassManager.addPass(createCSEPass());
-  funcPassManager.addPass(createCanonicalizerPass());
-  funcPassManager.addPass(createCSEPass());
-
-  // Even though we vectorize before bufferization we are not able to hoist
-  // accumulator load/store out of the K loop until distribution. This is
-  // because we materialize the fill and the matmul in two different scf.forall
-  // regions, when they should be in the same scf.forall. Newer pipelines
-  // like TileAndFuse don't have this problem, because they coalesce these
-  // scf.forall regions into a single scf.forall.
-  //
-  // Therefore we still rely on buffer level transformations for transfer ops
-  // hoisting and store to load forwarding. This relies on shacky alias
-  // analysis and we need to move this to tensor level once we have better
-  // abstractions.
-  funcPassManager.addPass(createOptimizeVectorTransferPass());
-
-  // Hoist loop invariant code to avoid pipelining it.
-  funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
-  // Pipeline memory operations.
-  funcPassManager.addPass(createGPUPipeliningPass());
-}
-
 //===---------------------------------------------------------------------===//
 // Matmul Tensor Core
 //===---------------------------------------------------------------------===//

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h

-4

@@ -28,10 +28,6 @@ using IREE::GPU::GPUPipelineOptions;
 // LLVMGPU Backend Pass Pipelines
 //----------------------------------------------------------------------------//

-/// Lowering using SIMT CUDA core operations.
-void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
-                                  const GPUPipelineOptions &options);
-
 /// Lowering using mma.sync Tensor Core operations.
 void addGPUMatmulTensorCoreMmaSyncPassPipeline(
     OpPassManager &funcPassManager, const GPUPipelineOptions &options,

compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp

+1 -10

@@ -38,10 +38,6 @@ getInstructionShape(Operation *op, CodeGenPipeline pipeline,
                     Type inputElementType,
                     SmallVector<int64_t> &instructionShape) {
   switch (pipeline) {
-  case CodeGenPipeline::LLVMGPUMatmulSimt:
-    // SIMT Pipeline / CUDA Cores
-    instructionShape = {1, 1, 1};
-    break;
   case CodeGenPipeline::LLVMGPUMatmulTensorCore:
     // Tensor Core Pipeline / WMMA API
     if (inputElementType.isF16() || inputElementType.isBF16()) {
@@ -81,8 +77,7 @@ verifyGPUMatmulPipeline(Operation *op,
                         ArrayRef<int64_t> workgroupSize) {
   // This verifier only applies to matmul.
   CodeGenPipeline pipeline = translationInfo.getDispatchLoweringPassPipeline();
-  if (pipeline != CodeGenPipeline::LLVMGPUMatmulSimt &&
-      pipeline != CodeGenPipeline::LLVMGPUMatmulTensorCore &&
+  if (pipeline != CodeGenPipeline::LLVMGPUMatmulTensorCore &&
       pipeline != CodeGenPipeline::LLVMGPUMatmulTensorCoreMmaSync) {
     return success();
   }
@@ -180,10 +175,6 @@ verifyGPUMatmulPipeline(Operation *op,
            << pipelineName;
   }

-  // Return success for SIMT/CUDA cores.
-  if (pipeline == CodeGenPipeline::LLVMGPUMatmulSimt)
-    return success();
-
   //
   // Additional verification Tensor Core pipelines.
   //

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir

+2 -3

@@ -267,12 +267,11 @@ func.func @not_vmt() {
   return
 }

-// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 128, 8]{{\]}}>
-// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 1, 1] subgroup_size = 64, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>
 // CHECK: func.func @not_vmt()
 // CHECK-SAME: translation_info = #[[$TRANSLATION]]
 // CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #[[$CONFIG]]
+// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 8], thread = [1, 128, 0], workgroup = [1, 128, 1]}>

 // -----

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_root_op_attribute.mlir

+1 -1

@@ -9,4 +9,4 @@ func.func @matmul(%lhs: tensor<4x4xf32>, %rhs: tensor<4x4xf32>) -> tensor<4x4xf3
   return %result : tensor<4x4xf32>
 }

-// CHECK: %2 = linalg.matmul {lowering_config = #config, root_op} ins(%arg0, %arg1 : tensor<4x4xf32>, tensor<4x4xf32>) outs(%1 : tensor<4x4xf32>) -> tensor<4x4xf32>
+// CHECK: %2 = linalg.matmul {lowering_config = #{{.*}}, root_op} ins(%arg0, %arg1 : tensor<4x4xf32>, tensor<4x4xf32>) outs(%1 : tensor<4x4xf32>) -> tensor<4x4xf32>

compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir

+4 -4

@@ -9,7 +9,7 @@
 #map = affine_map<()[s0] -> (s0 * 2)>
 #map1 = affine_map<()[s0] -> (s0 * 256)>
 #map2 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
-#translation = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [64, 1, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 func.func @dot_dispatch_0() attributes {translation_info = #translation} {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
@@ -79,7 +79,7 @@ func.func @dot_dispatch_0() attributes {translation_info = #translation} {
 #map2 = affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>
 #map3 = affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>
 #map4 = affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>
-#translation = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [8, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [8, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 func.func @batch_matmul_func() attributes {translation_info = #translation} {
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.000000e+00 : f32
@@ -148,7 +148,7 @@ func.func @batch_matmul_func() attributes {translation_info = #translation} {
 #map = affine_map<()[s0] -> (s0 * 2)>
 #map1 = affine_map<()[s0] -> (s0 * 32)>
 #map2 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
-#translation = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [64, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 func.func @dot_dispatch_0() attributes {translation_info = #translation} {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
@@ -312,7 +312,7 @@ module {
   #hal.pipeline.binding<storage_buffer>
 ]>
 #config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 2, 256, 4]]>
-#translation = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [64, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #map = affine_map<()[s0] -> (s0 * 2)>
 #map1 = affine_map<()[s0] -> (s0 * 256)>
 #map2 = affine_map<(d0)[s0] -> (-d0 + s0, 2)>
