
Commit 345b1da

Revert "[LLVMGPU] Deprecate the matmul simt pipeline (#19335)" (#19508)
This reverts commit 6ff00a8, which causes the Llama3.1 8B fp16 model to generate NaN logits during prefill/decode. Issue: #19506

Signed-off-by: archana-ramalingam <archana.ramalingam@amd.com>
1 parent: 8ae1b54 · commit: 345b1da

17 files changed: +259 -108 lines

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_reorder_workgroups_static.mlir (+1 -1)

@@ -25,7 +25,7 @@
 ]>
 hal.executable private @main_dispatch_0 {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
-    hal.executable.export public @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 ordinal(0) layout(#pipeline_layout) attributes {subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>, workgroup_size = [64 : index, 16 : index, 1 : index]} {
+    hal.executable.export public @main_dispatch_0_matmul_transpose_b_32000x32000x4096_f16 ordinal(0) layout(#pipeline_layout) attributes {subgroup_size = 64 : index, translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>, workgroup_size = [64 : index, 16 : index, 1 : index]} {
     ^bb0(%arg0: !hal.device):
       %c250 = arith.constant 250 : index
       %c500 = arith.constant 500 : index

compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.td (+12 -10)

@@ -40,24 +40,26 @@ def LLVMGPU_SimpleDistribute
     : I32EnumAttrCase<"LLVMGPUDistribute", 102>;
 def LLVMGPU_Vectorize
     : I32EnumAttrCase<"LLVMGPUVectorize", 103>;
+def LLVMGPU_MatmulSimt
+    : I32EnumAttrCase<"LLVMGPUMatmulSimt", 104>;
 def LLVMGPU_MatmulTensorCore
-    : I32EnumAttrCase<"LLVMGPUMatmulTensorCore", 104>;
+    : I32EnumAttrCase<"LLVMGPUMatmulTensorCore", 105>;
 def LLVMGPU_TransposeSharedMem
-    : I32EnumAttrCase<"LLVMGPUTransposeSharedMem", 105>;
+    : I32EnumAttrCase<"LLVMGPUTransposeSharedMem", 106>;
 def LLVMGPU_WarpReduction
-    : I32EnumAttrCase<"LLVMGPUWarpReduction", 106>;
+    : I32EnumAttrCase<"LLVMGPUWarpReduction", 107>;
 def LLVMGPU_PackUnPack
-    : I32EnumAttrCase<"LLVMGPUPackUnPack", 107>;
+    : I32EnumAttrCase<"LLVMGPUPackUnPack", 108>;
 def LLVMGPU_MatmulTensorCoreMmaSync
-    : I32EnumAttrCase<"LLVMGPUMatmulTensorCoreMmaSync", 108>;
+    : I32EnumAttrCase<"LLVMGPUMatmulTensorCoreMmaSync", 109>;
 def LLVMGPU_VectorDistribute
-    : I32EnumAttrCase<"LLVMGPUVectorDistribute", 109>;
+    : I32EnumAttrCase<"LLVMGPUVectorDistribute", 110>;
 def LLVMGPU_PadAndVectorDistribute
-    : I32EnumAttrCase<"LLVMGPUPadAndVectorDistribute", 110>;
+    : I32EnumAttrCase<"LLVMGPUPadAndVectorDistribute", 111>;
 def LLVMGPU_WinogradVectorize
-    : I32EnumAttrCase<"LLVMGPUWinogradVectorize", 111>;
+    : I32EnumAttrCase<"LLVMGPUWinogradVectorize", 112>;
 def LLVMGPU_TileAndFuse
-    : I32EnumAttrCase<"LLVMGPUTileAndFuse", 112>;
+    : I32EnumAttrCase<"LLVMGPUTileAndFuse", 113>;

 def SPIRV_BaseLowering
     : I32EnumAttrCase<"SPIRVBaseLowering", 200>;
@@ -96,7 +98,7 @@ def DispatchLoweringPassPipelineEnum : I32EnumAttr<

     // LLVMGPU CodeGen pipelines
     LLVMGPU_Default, LLVMGPU_BaseLowering, LLVMGPU_SimpleDistribute,
-    LLVMGPU_Vectorize, LLVMGPU_MatmulTensorCore,
+    LLVMGPU_Vectorize, LLVMGPU_MatmulSimt, LLVMGPU_MatmulTensorCore,
     LLVMGPU_TransposeSharedMem, LLVMGPU_WarpReduction, LLVMGPU_PackUnPack,
     LLVMGPU_MatmulTensorCoreMmaSync, LLVMGPU_VectorDistribute,
     LLVMGPU_PadAndVectorDistribute, LLVMGPU_WinogradVectorize,
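
Net effect of the TableGen hunk above: LLVMGPUMatmulSimt is reinstated at value 104 and every later LLVMGPU pipeline case shifts up by one. A standalone C++ sketch of the resulting numbering, illustrative only (the real enum is generated from IREECodegenAttrs.td):

// Illustration of the pipeline enum numbering after the revert; the values are
// copied from the diff above, the type itself is a stand-in.
enum class DispatchLoweringPassPipelineSketch : int {
  LLVMGPUDistribute = 102,
  LLVMGPUVectorize = 103,
  LLVMGPUMatmulSimt = 104,            // reinstated by this revert
  LLVMGPUMatmulTensorCore = 105,      // was 104 before the revert
  LLVMGPUTransposeSharedMem = 106,
  LLVMGPUWarpReduction = 107,
  LLVMGPUPackUnPack = 108,
  LLVMGPUMatmulTensorCoreMmaSync = 109,
  LLVMGPUVectorDistribute = 110,
  LLVMGPUPadAndVectorDistribute = 111,
  LLVMGPUWinogradVectorize = 112,
  LLVMGPUTileAndFuse = 113,
};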

compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp (+7 -63)

@@ -1295,11 +1295,9 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
                                        CodeGenPipeline pipeline) {
   TileSizesListType tileSizes;
   unsigned numParallelLoops = op.getNumParallelLoops();
-  unsigned numReductionLoops = op.getNumReductionLoops();
-  SmallVector<int64_t> workgroupTileSizes(
-      numParallelLoops + numReductionLoops, 1);
-  workgroupTileSizes[numParallelLoops - 2] = tileX;
-  workgroupTileSizes[numParallelLoops - 1] = tileY;
+  SmallVector<int64_t> workgroupTileSizes(numParallelLoops - 2, 1);
+  workgroupTileSizes.append({tileX, tileY});
+  workgroupTileSizes.append(op.getNumReductionLoops(), tileK);

   SmallVector<unsigned> partitionedLoops =
       cast<PartitionableLoopsInterface>(op.getOperation())
@@ -1313,65 +1311,11 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
     }
   }

+  tileSizes.emplace_back(std::move(workgroupTileSizes)); // Workgroup level.
   std::optional<int64_t> subgroupSize = std::nullopt;
   if (!subgroupSizes.empty())
     subgroupSize = subgroupSizes.front();

-  // For the LLVMGPUTileAndFuse pipeline, we need to split tile sizes
-  // for workgroup, thread, and reduction.
-  if (pipeline == CodeGenPipeline::LLVMGPUTileAndFuse) {
-
-    auto context = op.getContext();
-    Builder b(context);
-    SmallVector<NamedAttribute, 1> attrs;
-
-    SmallVector<int64_t> threadTileSizes(numParallelLoops + numReductionLoops,
-                                         0);
-    std::fill(threadTileSizes.begin(),
-              threadTileSizes.begin() + numParallelLoops, 1);
-
-    threadTileSizes[numParallelLoops - 2] =
-        (tileX / workgroupSize[0]) < 1 ? 1 : (tileX / workgroupSize[0]);
-    threadTileSizes[numParallelLoops - 1] =
-        (tileY / workgroupSize[1]) < 1 ? 1 : (tileY / workgroupSize[1]);
-
-    SmallVector<int64_t> reductionTileSizes(
-        numParallelLoops + numReductionLoops, 0);
-    reductionTileSizes[numParallelLoops + numReductionLoops - 1] = tileK;
-
-    attrs.emplace_back(b.getStringAttr("workgroup"),
-                       b.getI64ArrayAttr(workgroupTileSizes));
-    attrs.emplace_back(b.getStringAttr("thread"),
-                       b.getI64ArrayAttr(threadTileSizes));
-    attrs.emplace_back(b.getStringAttr("reduction"),
-                       b.getI64ArrayAttr(reductionTileSizes));
-
-    // Promote operands to use shared memory for LHS and RHS.
-    IREE::GPU::setPromotedOperandList(context, attrs, {0, 1});
-    auto configDict = b.getDictionaryAttr(attrs);
-    auto loweringConfig =
-        IREE::GPU::LoweringConfigAttr::get(context, configDict);
-    SmallVector<NamedAttribute, 1> pipelineAttrs;
-    auto pipelineOptions = IREE::GPU::GPUPipelineOptionsAttr::get(
-        context, /*prefetchSharedMemory=*/false,
-        /*no_reduce_shared_memory_bank_conflicts=*/true,
-        /*use_igemm_convolution=*/false,
-        /*reorder_workgroups_strategy=*/std::nullopt);
-    pipelineAttrs.emplace_back(
-        b.getStringAttr(IREE::GPU::GPUPipelineOptionsAttr::getDictKeyName()),
-        pipelineOptions);
-    auto pipelineConfig = b.getDictionaryAttr(pipelineAttrs);
-
-    return setOpConfigAndEntryPointFnTranslation(
-        entryPoint, op, loweringConfig, pipeline, workgroupSize, subgroupSize,
-        pipelineConfig);
-  }
-
-  // Other pipeline (MatmulTensorCore) expect the reduction tile size to be in
-  // the same list.
-  workgroupTileSizes[numParallelLoops + numReductionLoops - 1] = tileK;
-  tileSizes.emplace_back(std::move(workgroupTileSizes));
-
   return setOpConfigAndEntryPointFnTranslation(
       entryPoint, op, tileSizes, pipeline, workgroupSize, subgroupSize,
       getSoftwarePipeliningAttrDict(op->getContext(), softwarePipelineDepth,
@@ -1446,7 +1390,7 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
     return setMatmulConfig(
         sizeN, sizeM, 4, {sizeM, sizeN, 1},
         target.getWgp().getSubgroupSizeChoices().asArrayRef(),
-        softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUTileAndFuse);
+        softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUMatmulSimt);
   }

   // SIMT matmul case. Query the best configuration.
@@ -1460,7 +1404,7 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
           config.tileSize[0], config.tileSize[1], config.tileSize[2],
           config.workgroupSize,
           target.getWgp().getSubgroupSizeChoices().asArrayRef(),
-          softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUTileAndFuse);
+          softwarePipelineDepthSimt, CodeGenPipeline::LLVMGPUMatmulSimt);
     }
   }
 }
@@ -1485,7 +1429,7 @@ static LogicalResult setContractConfig(IREE::GPU::TargetAttr target,
   return setMatmulConfig(tileX, tileY, tileK, workgroupSize,
                          target.getWgp().getSubgroupSizeChoices().asArrayRef(),
                          softwarePipelineDepthSimt,
-                         CodeGenPipeline::LLVMGPUTileAndFuse);
+                         CodeGenPipeline::LLVMGPUMatmulSimt);
 }

 //====---------------------------------------------------------------------===//
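
With the KernelConfig.cpp change, the SIMT path hands the lowering a single workgroup-level tile-size list of the form [1, ..., 1, tileX, tileY, tileK]: ones for the outer parallel dims, then the M/N workgroup tiles, then the reduction tile, instead of the separate workgroup/thread/reduction attributes the deprecating commit built for LLVMGPUTileAndFuse. A minimal self-contained C++ sketch of that assembly, using std::vector in place of llvm::SmallVector; the values in main are made up for illustration:

#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors the restored workgroup tile-size assembly from the diff above.
std::vector<int64_t> buildSimtWorkgroupTileSizes(unsigned numParallelLoops,
                                                 unsigned numReductionLoops,
                                                 int64_t tileX, int64_t tileY,
                                                 int64_t tileK) {
  std::vector<int64_t> sizes(numParallelLoops - 2, 1); // outer parallel dims
  sizes.push_back(tileX);                              // M tile
  sizes.push_back(tileY);                              // N tile
  sizes.insert(sizes.end(), numReductionLoops, tileK); // reduction tile(s)
  return sizes;
}

int main() {
  // e.g. a batch matmul with parallel loops (B, M, N) and one reduction loop (K).
  for (int64_t s : buildSimtWorkgroupTileSizes(3, 1, 32, 128, 16))
    std::cout << s << ' '; // prints: 1 32 128 16
  std::cout << '\n';
}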

compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp (+3)

@@ -114,6 +114,9 @@ void LLVMGPULowerExecutableTargetPass::runOnOperation() {
   case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUWinogradVectorize:
     addGPUWinogradVectorizePassPipeline(pipeline);
     break;
+  case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulSimt:
+    addGPUMatmulSimtPassPipeline(pipeline, pipelineOptions);
+    break;
   case IREE::Codegen::DispatchLoweringPassPipeline::LLVMGPUMatmulTensorCore: {
     FailureOr<int64_t> maybeDepth =
         getSoftwarePipelineDepth(translationInfo.getConfiguration());

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp (+66)

@@ -526,6 +526,72 @@ void addGPUWinogradVectorizePassPipeline(OpPassManager &funcPassManager) {
   funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
 }

+//===---------------------------------------------------------------------===//
+// MatmulSIMT
+//===---------------------------------------------------------------------===//
+
+void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
+                                  const GPUPipelineOptions &options) {
+  tileAndDistributeToWorkgroup(funcPassManager, /*useForall=*/false);
+
+  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
+  funcPassManager.addPass(createConfigTrackingCanonicalizerPass());
+  funcPassManager.addPass(createCSEPass());
+
+  funcPassManager.addPass(createGPUTensorTileToSerialLoopsPass());
+  funcPassManager.addPass(createGPUTensorAlloc());
+  funcPassManager.addPass(createGPUTensorTilePass());
+
+  // Linalg -> vector
+  addGPUVectorizationPasses(funcPassManager);
+
+  // tensor to memref
+  addBufferizePasses(funcPassManager);
+
+  // distribute foreach threads
+  funcPassManager.addPass(createGPUDistributePass());
+
+  funcPassManager.addPass(createMemrefCopyToLinalgPass());
+  funcPassManager.addPass(createGPUDistributeSharedMemoryCopyPass());
+  funcPassManager.addPass(createCanonicalizerPass());
+  funcPassManager.addPass(createCSEPass());
+
+  if (options.enableReduceSharedMemoryBankConflicts) {
+    funcPassManager.addPass(createGPUReduceBankConflictsPass());
+  }
+
+  ReorderWorkgroupsStrategy reorderStrategy =
+      getReorderWorkgroupsStrategy(options.reorderStrategy);
+  funcPassManager.addPass(
+      createReorderWorkgroups(reorderStrategy, canReorderWorkgroups));
+
+  funcPassManager.addPass(createCanonicalizerPass());
+  funcPassManager.addPass(createCSEPass());
+
+  funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
+  funcPassManager.addPass(createCSEPass());
+  funcPassManager.addPass(createCanonicalizerPass());
+  funcPassManager.addPass(createCSEPass());
+
+  // Even though we vectorize before bufferization we are not able to hoist
+  // accumulator load/store out of the K loop until distribution. This is
+  // because we materialize the fill and the matmul in two different scf.forall
+  // regions, when they should be in the same scf.forall. Newer pipelines
+  // like TileAndFuse don't have this problem, because they coalesce these
+  // scf.forall regions into a single scf.forall.
+  //
+  // Therefore we still rely on buffer level transformations for transfer ops
+  // hoisting and store to load forwarding. This relies on shacky alias
+  // analysis and we need to move this to tensor level once we have better
+  // abstractions.
+  funcPassManager.addPass(createOptimizeVectorTransferPass());
+
+  // Hoist loop invariant code to avoid pipelining it.
+  funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
+  // Pipeline memory operations.
+  funcPassManager.addPass(createGPUPipeliningPass());
+}
+
 //===---------------------------------------------------------------------===//
 // Matmul Tensor Core
 //===---------------------------------------------------------------------===//
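
Read top to bottom, the restored pipeline is roughly: distribute to workgroups, tile reductions to serial loops, allocate and tile shared-memory tensors, vectorize, bufferize, distribute to threads, distribute shared-memory copies, optionally pad shared memory against bank conflicts, optionally reorder workgroups, then clean up, optimize vector transfers, hoist loop-invariant code, and software-pipeline memory operations. A toy standalone C++ sketch of how the two option-gated stages are selected; the field names follow the diff, everything else is illustrative and not the IREE API:

#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Toy stand-in for the GPUPipelineOptions fields referenced in the diff above.
struct PipelineOptionsSketch {
  bool enableReduceSharedMemoryBankConflicts = true;
  std::optional<std::string> reorderStrategy; // e.g. "Transpose" (hypothetical value)
};

std::vector<std::string> simtStages(const PipelineOptionsSketch &opts) {
  std::vector<std::string> stages = {
      "tile-and-distribute-to-workgroups", "tile-reductions-to-serial-loops",
      "alloc-and-tile-shared-memory",      "vectorize",
      "bufferize",                         "distribute-to-threads",
      "distribute-shared-memory-copies"};
  if (opts.enableReduceSharedMemoryBankConflicts)
    stages.push_back("reduce-shared-memory-bank-conflicts"); // optional padding
  if (opts.reorderStrategy)
    stages.push_back("reorder-workgroups:" + *opts.reorderStrategy);
  stages.insert(stages.end(), {"optimize-vector-transfers",
                               "loop-invariant-code-motion",
                               "software-pipelining"});
  return stages;
}

int main() {
  for (const std::string &s : simtStages(PipelineOptionsSketch{}))
    std::cout << s << '\n';
}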

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.h (+4)

@@ -28,6 +28,10 @@ using IREE::GPU::GPUPipelineOptions;
 // LLVMGPU Backend Pass Pipelines
 //----------------------------------------------------------------------------//

+/// Lowering using SIMT CUDA core operations.
+void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
+                                  const GPUPipelineOptions &options);
+
 /// Lowering using mma.sync Tensor Core operations.
 void addGPUMatmulTensorCoreMmaSyncPassPipeline(
     OpPassManager &funcPassManager, const GPUPipelineOptions &options,

compiler/src/iree/compiler/Codegen/LLVMGPU/Verifiers.cpp (+10 -1)

@@ -38,6 +38,10 @@ getInstructionShape(Operation *op, CodeGenPipeline pipeline,
                     Type inputElementType,
                     SmallVector<int64_t> &instructionShape) {
   switch (pipeline) {
+  case CodeGenPipeline::LLVMGPUMatmulSimt:
+    // SIMT Pipeline / CUDA Cores
+    instructionShape = {1, 1, 1};
+    break;
   case CodeGenPipeline::LLVMGPUMatmulTensorCore:
     // Tensor Core Pipeline / WMMA API
     if (inputElementType.isF16() || inputElementType.isBF16()) {
@@ -77,7 +81,8 @@ verifyGPUMatmulPipeline(Operation *op,
                         ArrayRef<int64_t> workgroupSize) {
   // This verifier only applies to matmul.
   CodeGenPipeline pipeline = translationInfo.getDispatchLoweringPassPipeline();
-  if (pipeline != CodeGenPipeline::LLVMGPUMatmulTensorCore &&
+  if (pipeline != CodeGenPipeline::LLVMGPUMatmulSimt &&
+      pipeline != CodeGenPipeline::LLVMGPUMatmulTensorCore &&
       pipeline != CodeGenPipeline::LLVMGPUMatmulTensorCoreMmaSync) {
     return success();
   }
@@ -175,6 +180,10 @@ verifyGPUMatmulPipeline(Operation *op,
            << pipelineName;
   }

+  // Return success for SIMT/CUDA cores.
+  if (pipeline == CodeGenPipeline::LLVMGPUMatmulSimt)
+    return success();
+
   //
   // Additional verification Tensor Core pipelines.
   //
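
The Verifiers.cpp change means SIMT matmuls are no longer skipped by the matmul verifier: they are assigned a trivial 1x1x1 instruction shape and return success before the Tensor Core specific checks. A simplified self-contained C++ sketch of that control flow, with stand-in types (the real code uses CodeGenPipeline, LogicalResult, and SmallVector; the Tensor Core shape below is only a placeholder):

#include <array>
#include <cstdint>
#include <iostream>

// Stand-ins for the IREE/MLIR types used by verifyGPUMatmulPipeline.
enum class PipelineSketch { MatmulSimt, MatmulTensorCore, MatmulTensorCoreMmaSync, Other };
using InstructionShape = std::array<int64_t, 3>; // {M, N, K} handled per instruction

bool verifyMatmulPipelineSketch(PipelineSketch pipeline) {
  // The verifier only applies to the matmul pipelines; everything else passes.
  if (pipeline != PipelineSketch::MatmulSimt &&
      pipeline != PipelineSketch::MatmulTensorCore &&
      pipeline != PipelineSketch::MatmulTensorCoreMmaSync)
    return true;

  // SIMT / CUDA cores do a scalar multiply-accumulate per lane, so the
  // per-instruction shape is 1x1x1.
  InstructionShape shape = {1, 1, 1};
  if (pipeline != PipelineSketch::MatmulSimt)
    shape = InstructionShape{16, 16, 16}; // placeholder for the WMMA/mma.sync shape

  // ... checks shared by all matmul pipelines would run here ...

  // SIMT returns early; only the Tensor Core pipelines get the extra checks
  // that compare tile sizes against the instruction shape.
  if (pipeline == PipelineSketch::MatmulSimt)
    return true;

  (void)shape; // Tensor Core specific verification elided in this sketch.
  return true;
}

int main() {
  std::cout << verifyMatmulPipelineSketch(PipelineSketch::MatmulSimt) << '\n'; // 1
}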

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir (+3 -2)

@@ -267,11 +267,12 @@ func.func @not_vmt() {
   return
 }

-// CHECK-DAG: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [32, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>}>
+// CHECK-DAG: #[[$CONFIG:.+]] = #iree_codegen.lowering_config<tile_sizes = {{\[}}[1, 128, 8]{{\]}}>
+// CHECK: #[[$TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 1, 1] subgroup_size = 64, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 // CHECK: func.func @not_vmt()
 // CHECK-SAME: translation_info = #[[$TRANSLATION]]
 // CHECK: linalg.generic
-// CHECK-SAME: lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1], reduction = [0, 0, 8], thread = [1, 128, 0], workgroup = [1, 128, 1]}>
+// CHECK-SAME: lowering_config = #[[$CONFIG]]

 // -----

compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_root_op_attribute.mlir (+1 -1)

@@ -9,4 +9,4 @@ func.func @matmul(%lhs: tensor<4x4xf32>, %rhs: tensor<4x4xf32>) -> tensor<4x4xf32>
   return %result : tensor<4x4xf32>
 }

-// CHECK: %2 = linalg.matmul {lowering_config = #{{.*}}, root_op} ins(%arg0, %arg1 : tensor<4x4xf32>, tensor<4x4xf32>) outs(%1 : tensor<4x4xf32>) -> tensor<4x4xf32>
+// CHECK: %2 = linalg.matmul {lowering_config = #config, root_op} ins(%arg0, %arg1 : tensor<4x4xf32>, tensor<4x4xf32>) outs(%1 : tensor<4x4xf32>) -> tensor<4x4xf32>

compiler/src/iree/compiler/Codegen/LLVMGPU/test/distribute_to_thread.mlir (+4 -4)

@@ -9,7 +9,7 @@
 #map = affine_map<()[s0] -> (s0 * 2)>
 #map1 = affine_map<()[s0] -> (s0 * 256)>
 #map2 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
-#translation = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [64, 1, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 func.func @dot_dispatch_0() attributes {translation_info = #translation} {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
@@ -79,7 +79,7 @@ func.func @dot_dispatch_0() attributes {translation_info = #translation} {
 #map2 = affine_map<(d0, d1, d2)[s0] -> (d0 * 32768 + s0 + d1 * 1024 + d2)>
 #map3 = affine_map<(d0, d1, d2)[s0] -> (d0 * 65536 + s0 + d1 * 64 + d2)>
 #map4 = affine_map<(d0, d1, d2)[s0] -> (d0 * 2048 + s0 + d1 * 64 + d2)>
-#translation = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [8, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [8, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 func.func @batch_matmul_func() attributes {translation_info = #translation} {
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.000000e+00 : f32
@@ -148,7 +148,7 @@ func.func @batch_matmul_func() attributes {translation_info = #translation} {
 #map = affine_map<()[s0] -> (s0 * 2)>
 #map1 = affine_map<()[s0] -> (s0 * 32)>
 #map2 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
-#translation = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [64, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 func.func @dot_dispatch_0() attributes {translation_info = #translation} {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
@@ -312,7 +312,7 @@ module {
   #hal.pipeline.binding<storage_buffer>
 ]>
 #config = #iree_codegen.lowering_config<tile_sizes = [[0, 1, 2, 256, 4]]>
-#translation = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
+#translation = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [64, 8, 1], {pipeline_depth = 0 : i64, store_stage = 1 : i64}>
 #map = affine_map<()[s0] -> (s0 * 2)>
 #map1 = affine_map<()[s0] -> (s0 * 256)>
 #map2 = affine_map<(d0)[s0] -> (-d0 + s0, 2)>