@@ -1295,11 +1295,9 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
1295
1295
CodeGenPipeline pipeline) {
1296
1296
TileSizesListType tileSizes;
1297
1297
unsigned numParallelLoops = op.getNumParallelLoops ();
1298
- unsigned numReductionLoops = op.getNumReductionLoops ();
1299
- SmallVector<int64_t > workgroupTileSizes (
1300
- numParallelLoops + numReductionLoops, 1 );
1301
- workgroupTileSizes[numParallelLoops - 2 ] = tileX;
1302
- workgroupTileSizes[numParallelLoops - 1 ] = tileY;
1298
+ SmallVector<int64_t > workgroupTileSizes (numParallelLoops - 2 , 1 );
1299
+ workgroupTileSizes.append ({tileX, tileY});
1300
+ workgroupTileSizes.append (op.getNumReductionLoops (), tileK);
1303
1301
1304
1302
SmallVector<unsigned > partitionedLoops =
1305
1303
cast<PartitionableLoopsInterface>(op.getOperation ())
@@ -1313,65 +1311,11 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
1313
1311
}
1314
1312
}
1315
1313
1314
+ tileSizes.emplace_back (std::move (workgroupTileSizes)); // Workgroup level.
1316
1315
std::optional<int64_t > subgroupSize = std::nullopt;
1317
1316
if (!subgroupSizes.empty ())
1318
1317
subgroupSize = subgroupSizes.front ();
1319
1318
1320
- // For the LLVMGPUTileAndFuse pipeline, we need to split tile sizes
1321
- // for workgroup, thread, and reduction.
1322
- if (pipeline == CodeGenPipeline::LLVMGPUTileAndFuse) {
1323
-
1324
- auto context = op.getContext ();
1325
- Builder b (context);
1326
- SmallVector<NamedAttribute, 1 > attrs;
1327
-
1328
- SmallVector<int64_t > threadTileSizes (numParallelLoops + numReductionLoops,
1329
- 0 );
1330
- std::fill (threadTileSizes.begin (),
1331
- threadTileSizes.begin () + numParallelLoops, 1 );
1332
-
1333
- threadTileSizes[numParallelLoops - 2 ] =
1334
- (tileX / workgroupSize[0 ]) < 1 ? 1 : (tileX / workgroupSize[0 ]);
1335
- threadTileSizes[numParallelLoops - 1 ] =
1336
- (tileY / workgroupSize[1 ]) < 1 ? 1 : (tileY / workgroupSize[1 ]);
1337
-
1338
- SmallVector<int64_t > reductionTileSizes (
1339
- numParallelLoops + numReductionLoops, 0 );
1340
- reductionTileSizes[numParallelLoops + numReductionLoops - 1 ] = tileK;
1341
-
1342
- attrs.emplace_back (b.getStringAttr (" workgroup" ),
1343
- b.getI64ArrayAttr (workgroupTileSizes));
1344
- attrs.emplace_back (b.getStringAttr (" thread" ),
1345
- b.getI64ArrayAttr (threadTileSizes));
1346
- attrs.emplace_back (b.getStringAttr (" reduction" ),
1347
- b.getI64ArrayAttr (reductionTileSizes));
1348
-
1349
- // Promote operands to use shared memory for LHS and RHS.
1350
- IREE::GPU::setPromotedOperandList (context, attrs, {0 , 1 });
1351
- auto configDict = b.getDictionaryAttr (attrs);
1352
- auto loweringConfig =
1353
- IREE::GPU::LoweringConfigAttr::get (context, configDict);
1354
- SmallVector<NamedAttribute, 1 > pipelineAttrs;
1355
- auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get (
1356
- context, /* prefetchSharedMemory=*/ false ,
1357
- /* no_reduce_shared_memory_bank_conflicts=*/ true ,
1358
- /* use_igemm_convolution=*/ false ,
1359
- /* reorder_workgroups_strategy=*/ std::nullopt);
1360
- pipelineAttrs.emplace_back (
1361
- b.getStringAttr (IREE::GPU::GPUPipelineOptionsAttr::getDictKeyName ()),
1362
- pipelineOptions);
1363
- auto pipelineConfig = b.getDictionaryAttr (pipelineAttrs);
1364
-
1365
- return setOpConfigAndEntryPointFnTranslation (
1366
- entryPoint, op, loweringConfig, pipeline, workgroupSize, subgroupSize,
1367
- pipelineConfig);
1368
- }
1369
-
1370
- // Other pipeline (MatmulTensorCore) expect the reduction tile size to be in
1371
- // the same list.
1372
- workgroupTileSizes[numParallelLoops + numReductionLoops - 1 ] = tileK;
1373
- tileSizes.emplace_back (std::move (workgroupTileSizes));
1374
-
1375
1319
return setOpConfigAndEntryPointFnTranslation (
1376
1320
entryPoint, op, tileSizes, pipeline, workgroupSize, subgroupSize,
1377
1321
getSoftwarePipeliningAttrDict (op->getContext (), softwarePipelineDepth,
@@ -1446,7 +1390,7 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
1446
1390
return setMatmulConfig (
1447
1391
sizeN, sizeM, 4 , {sizeM, sizeN, 1 },
1448
1392
target.getWgp ().getSubgroupSizeChoices ().asArrayRef (),
1449
- softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUTileAndFuse );
1393
+ softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUMatmulSimt );
1450
1394
}
1451
1395
1452
1396
// SIMT matmul case. Query the best configuration.
@@ -1460,7 +1404,7 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
1460
1404
config.tileSize [0 ], config.tileSize [1 ], config.tileSize [2 ],
1461
1405
config.workgroupSize ,
1462
1406
target.getWgp ().getSubgroupSizeChoices ().asArrayRef (),
1463
- softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUTileAndFuse );
1407
+ softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUMatmulSimt );
1464
1408
}
1465
1409
}
1466
1410
}
@@ -1485,7 +1429,7 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
1485
1429
return setMatmulConfig (tileX, tileY, tileK, workgroupSize,
1486
1430
target.getWgp ().getSubgroupSizeChoices ().asArrayRef (),
1487
1431
softwarePipelineDepthSimt,
1488
- CodeGenPipeline::LLVMGPUTileAndFuse );
1432
+ CodeGenPipeline::LLVMGPUMatmulSimt );
1489
1433
}
1490
1434
1491
1435
// ====---------------------------------------------------------------------===//
0 commit comments