Add comments to the tests.

hanhanW · hanhanW · commit 931032c88b4d · 2025-01-08T12:54:48.000+08:00
Signed-off-by: hanhanW &lt;hanhan0912@gmail.com&gt;
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/SpecializeEncodings.cpp b/compiler/src/iree/compiler/Dialect/Stream/Transforms/SpecializeEncodings.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/LogicalResult.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/SymbolTable.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
@@ -52,6 +53,175 @@ SmallVector<const T *> gatherUsedDialectInterfaces(mlir::ModuleOp moduleOp) {
   return results;
 }
 
+/// Collects, for each resource operand of `dispatchOp` whose type implements
+/// AffinityTypeInterface, an ArrayAttr holding the affinities that
+/// `affinityAnalysis` resolved for it. An operand whose lookup fails
+/// contributes an empty ArrayAttr; operands of any other type contribute
+/// nothing, so the result may be shorter than the resource operand list.
+static SmallVector<Attribute>
+getResourceOperandsAffinities(IREE::Stream::AffinityAnalysis &affinityAnalysis,
+                              IREE::Stream::AsyncDispatchOp dispatchOp) {
+  Builder builder(dispatchOp.getContext());
+  ArrayAttr noAffinities = builder.getArrayAttr({});
+  SmallVector<Attribute> perOperandAffinities;
+  for (Value resource : dispatchOp.getResourceOperands()) {
+    // Only operands with an affinity-aware type take part in the result.
+    if (!isa<IREE::Stream::AffinityTypeInterface>(resource.getType())) {
+      continue;
+    }
+    SmallVector<IREE::Stream::AffinityAttr> resolved;
+    bool found = affinityAnalysis.tryLookupResourceAffinity(resource, resolved);
+    ArrayAttr entry =
+        found ? builder.getArrayAttr(llvm::to_vector_of<Attribute>(resolved))
+              : noAffinities;
+    perOperandAffinities.push_back(entry);
+  }
+  return perOperandAffinities;
+}
+
+/// Duplicates stream.executables based on the affinity analysis of
+/// stream.async.dispatch ops. An executable can be launched from several
+/// devices, which produces wrong codegen artifacts when binding types are
+/// encoded (i.e., the tensor type has an encoding attribute), because devices
+/// may resolve the same encoding to different layouts. E.g., if device_a and
+/// device_b interpret an encoded tensor type differently and one executable is
+/// launched with resources from either device, the input layout of that
+/// executable is ambiguous. In this case the executable is cloned once per
+/// distinct (execution affinity, resource operand affinities) pair, and every
+/// dispatch in `funcOp` is retargeted to the matching clone. Returns failure
+/// if the affinity analysis or an execution affinity lookup fails.
+static LogicalResult duplicateExecutablesPerAffinityVariant(
+    ModuleOp moduleOp, SymbolTable symbolTable, FunctionOpInterface funcOp,
+    IREE::Stream::ResolveLayoutAttrFn resolveLayoutAttr) {
+  MLIRContext *ctx = moduleOp.getContext();
+  IRRewriter rewriter(ctx);
+
+  // 1. Gather per-export [execution affinity -> [resource affinities]] map.
+  IREE::Stream::AffinityAnalysis affinityAnalysis(moduleOp);
+  if (failed(affinityAnalysis.run())) {
+    return moduleOp.emitError("failed on running affinity analysis");
+  }
+  SmallVector<IREE::Stream::AsyncDispatchOp> candidates;
+  funcOp.walk(
+      [&](IREE::Stream::AsyncDispatchOp op) { candidates.push_back(op); });
+
+  // export -> [affinity -> array per resource of affinities PVS].
+  DenseMap<IREE::Stream::ExecutableExportOp,
+           SetVector<std::pair<IREE::Stream::AffinityAttr, ArrayAttr>>>
+      exportToDispatchSites;
+
+  llvm::MapVector<IREE::Stream::AsyncDispatchOp, SmallVector<Attribute>>
+      resourceAffinities;
+  for (auto dispatchOp : candidates) {
+    SmallVector<IREE::Stream::AffinityAttr> execAffinities;
+    if (!affinityAnalysis.tryLookupExecutionAffinity(dispatchOp,
+                                                     execAffinities)) {
+      return dispatchOp.emitError("failed on execution affinity lookup");
+    }
+    assert(execAffinities.size() == 1 &&
+           "We should only have a single execution "
+           "affinity when running the pass.");
+
+    SmallVector<Attribute> operandAffinityAttrs =
+        getResourceOperandsAffinities(affinityAnalysis, dispatchOp);
+    resourceAffinities[dispatchOp] = operandAffinityAttrs;
+
+    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
+      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
+          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
+      exportToDispatchSites[exportOp].insert(std::make_pair(
+          execAffinities[0], rewriter.getArrayAttr(operandAffinityAttrs)));
+    });
+  }
+
+  LLVM_DEBUG({
+    llvm::dbgs() << "Dump of exportToDispatchSites\n";
+    for (auto [exportOp, affinities] : exportToDispatchSites) {
+      llvm::dbgs() << "  ExportOp: " << exportOp.getSymName() << "\n";
+      for (auto [execAffinity, resourceAffinities] : affinities) {
+        llvm::dbgs() << "    executaion affinity: " << execAffinity << "\n";
+        llvm::dbgs() << "    resource affinities: " << resourceAffinities
+                     << "\n";
+      }
+    }
+  });
+
+  // 2. Duplicate executables for each unique set of resource affinities.
+
+  // Mapping from [execution affinity, resource operands affinities, export] to
+  // the executable op.
+  using DispatchSiteInfo = std::tuple<IREE::Stream::AffinityAttr, ArrayAttr,
+                                      IREE::Stream::ExecutableExportOp>;
+  DenseMap<DispatchSiteInfo, IREE::Stream::ExecutableOp>
+      dispatchSiteToExecutableOp;
+  for (auto [exportOp, execAndResourceAffinities] : exportToDispatchSites) {
+    auto executableOp = exportOp->getParentOfType<IREE::Stream::ExecutableOp>();
+    // No need to duplicate the executable if all the uses have the same
+    // affinities.
+    // TODO(hanchung): Do not duplicate the executables if bindings are not
+    // encoded. I.e., all the tensor types do not have encodings.
+    if (execAndResourceAffinities.size() == 1) {
+      auto [execAffinity, resourceAffinities] = execAndResourceAffinities[0];
+      dispatchSiteToExecutableOp[DispatchSiteInfo(
+          execAffinity, resourceAffinities, exportOp)] = executableOp;
+      continue;
+    }
+
+    int64_t dupId = -1; // -1 means the original executable is reused.
+    for (auto [execAffinity, resourceAffinities] : execAndResourceAffinities) {
+      rewriter.setInsertionPointAfter(executableOp);
+      IREE::Stream::ExecutableOp dupOp = executableOp;
+      if (dupId != -1) {
+        auto symName = std::string(executableOp.getSymName());
+        symName += "_dup" + std::to_string(dupId);
+        dupOp = rewriter.cloneWithoutRegions(executableOp);
+        rewriter.modifyOpInPlace(dupOp, [&] {
+          dupOp.setSymName(symName);
+          IRMapping mapping;
+          executableOp.getRegion().cloneInto(&dupOp.getRegion(), mapping);
+        });
+      }
+      dispatchSiteToExecutableOp[DispatchSiteInfo(
+          execAffinity, resourceAffinities, exportOp)] = dupOp;
+      dupId++;
+    }
+  }
+
+  // 3. Update dispatch sites, i.e., point dispatch entry points to
+  // corresponding cloned executables.
+  for (auto dispatchOp : candidates) {
+    SmallVector<Attribute> newEntryPoints;
+    SmallVector<IREE::Stream::AffinityAttr> execAffinities;
+    // Run the lookup outside of assert(): it fills `execAffinities`, so it must
+    // not be compiled out with NDEBUG. Step 1 already validated the result.
+    [[maybe_unused]] bool found =
+        affinityAnalysis.tryLookupExecutionAffinity(dispatchOp, execAffinities);
+    assert(found && execAffinities.size() == 1);
+    SmallVector<Attribute> operandAttrs = resourceAffinities[dispatchOp];
+    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
+      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
+          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
+      auto info = DispatchSiteInfo(
+          execAffinities[0], rewriter.getArrayAttr(operandAttrs), exportOp);
+      assert(dispatchSiteToExecutableOp.count(info));
+
+      auto executableOp = dispatchSiteToExecutableOp[info];
+      auto newSym = SymbolRefAttr::get(executableOp->getAttrOfType<StringAttr>(
+                                           SymbolTable::getSymbolAttrName()),
+                                       entryPoint.getNestedReferences());
+      newEntryPoints.push_back(newSym);
+    });
+
+    rewriter.modifyOpInPlace(dispatchOp, [&] {
+      dispatchOp.setEntryPointsAttr(rewriter.getArrayAttr(newEntryPoints));
+    });
+  }
+
+  // TODO(hanchung): Update encodings in executables.
+
+  return success();
+}
+
 // TODO(hanchung): Add "cloneWithEncoding" method to RankedTensorType.
 static RankedTensorType cloneWithEncoding(RankedTensorType type,
                                           Attribute encodingAttr) {
@@ -149,6 +319,7 @@ struct SpecializeEncodingsPass
       return signalPassFailure();
     }
 
+    SymbolTable symbolTable(moduleOp);
     llvm::MapVector<StringRef, IREE::Stream::ExecutableOp> executableOps;
     for (auto executableOp : moduleOp.getOps<IREE::Stream::ExecutableOp>()) {
       executableOps[executableOp.getName()] = executableOp;
@@ -164,7 +335,11 @@ struct SpecializeEncodingsPass
         return signalPassFailure();
       }
 
-      // TODO(hanchung): Duplicate executables and update dispatch ops.
+      if (failed(duplicateExecutablesPerAffinityVariant(
+              moduleOp, symbolTable, funcOp, resolveLayoutAttr))) {
+        funcOp.emitError("failed on executable duplication");
+        return signalPassFailure();
+      }
     }
   }
 };
diff --git a/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir b/compiler/src/iree/compiler/Dialect/Stream/Transforms/test/specialize_encodings.mlir
@@ -1,19 +1,69 @@
 // RUN: iree-opt --split-input-file --iree-stream-specialize-encodings %s | FileCheck %s
 
-#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {encoding_layout = #iree_cpu.vmvx_encoding_layout<>, ukernels = "all"}>
+//------------------------------------------------------------------------------
+// Stream ops that have TensorPhaseOp trait. This test suite tests that the
+// encoding is updated so that it carries the resolved layouts.
+//------------------------------------------------------------------------------
+
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {encoding_layout = #iree_cpu.vmvx_encoding_layout<>}>
 #device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
 #encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type =  matmul, element_types = [f32, f32, f32]>
 module {
   util.global private @device_a = #device_target_local_0_
 
-  util.func public @main(%d0: index, %d1: index) -> index {
+  util.func public @tensor_sizeof(%d0: index, %d1: index) -> index {
     %size = stream.tensor.sizeof on(#hal.device.affinity<@device_a>) tensor<?x?xf32, #encoding>{%d0, %d1} : index
     util.return %size : index
   }
 }
 // CHECK:       #[[EXECUTABLE:.+]] = #hal.executable.target<"vmvx",
 // CHECK:       #[[$ENCODING:.+]] = #iree_encoding.encoding
 // CHECK-SAME:    layouts = [#[[EXECUTABLE]]]
-// CHECK-LABEL: util.func public @main
+// CHECK-LABEL: util.func public @tensor_sizeof
 // CHECK:         %[[RES:.+]] = stream.tensor.sizeof {{.+}} tensor<?x?xf32, #[[$ENCODING]]>
 // CHECK:         return %[[RES]]
+
+// -----
+
+#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>
+#map = affine_map<(d0) -> (d0)>
+#device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
+#device_target_local_1_ = #hal.device.target<"local", {ordinal = 1 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
+module attributes {stream.affinity.default = #hal.device.affinity<@device_a>} {
+  util.global private @device_a = #device_target_local_0_
+  util.global private @device_b = #device_target_local_1_
+  stream.executable private @ex {
+    stream.executable.export public @dispatch
+  }
+  util.func public @multi_device(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view {
+    %c16 = arith.constant 16 : index
+    %c0 = arith.constant 0 : index
+    %c4 = arith.constant 4 : index
+    %element_type_f32 = hal.element_type<f32> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c4]) type(%element_type_f32) encoding(%dense_row_major)
+    %0 = stream.tensor.import on(#hal.device.affinity<@device_a>) %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
+    %1 = stream.timepoint.import on(#hal.device.affinity<@device_a>) %arg1 : (!hal.fence) => !stream.timepoint
+    %2 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c16}
+    %3 = stream.async.transfer %2 : !stream.resource<external>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
+    %4 = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@dispatch(%3[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
+    %5 = stream.async.transfer %4 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_b>) !stream.resource<*>{%c16}
+    %6 = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex::@dispatch(%5[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
+    %7 = stream.async.transfer %6 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_b>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
+    %result, %result_timepoint = stream.timepoint.barrier on(#hal.device.affinity<@device_a>) %7 : !stream.resource<*>{%c16} => !stream.timepoint
+    stream.timepoint.chain_external on(#hal.device.affinity<@device_a>) %result_timepoint => (%arg2 : !hal.fence)
+    %8 = stream.async.transfer %result : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<external>{%c16}
+    %9 = stream.tensor.export on(#hal.device.affinity<@device_a>) %8 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
+    util.return %9 : !hal.buffer_view
+  }
+}
+
+// CHECK:       #[[DEVICE_LOCAL_0:.+]] = #hal.device.target
+// CHECK:       #[[DEVICE_LOCAL_1:.+]] = #hal.device.target
+// CHECK:       util.global private @[[$DEVICE_A:.+]] = #[[DEVICE_LOCAL_0]]
+// CHECK:       util.global private @[[$DEVICE_B:.+]] = #[[DEVICE_LOCAL_1]]
+// CHECK:       stream.executable private @[[$EX0:.+]] {
+// CHECK:       stream.executable private @[[$EX1:.+]] {
+// CHECK-LABEL: util.func public @multi_device
+// CHECK:         stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_A]]>) @[[$EX0]]::@dispatch
+// CHECK:         stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_B]]>) @[[$EX1]]::@dispatch