Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for executable duplication in encoding specialization pass. #19527

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/LogicalResult.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
Expand Down Expand Up @@ -52,6 +53,179 @@ SmallVector<const T *> gatherUsedDialectInterfaces(mlir::ModuleOp moduleOp) {
return results;
}

/// Returns one array attribute per affinity-typed resource operand of
/// `dispatchOp`, holding the affinities resolved by `affinityAnalysis` for
/// that operand. An empty array attribute marks an operand whose affinity
/// could not be resolved (i.e., the affinity analysis failed for it).
/// Operands whose type does not implement AffinityTypeInterface are skipped
/// entirely, so the result may have fewer entries than the operand list.
static SmallVector<Attribute>
getResourceOperandsAffinities(IREE::Stream::AffinityAnalysis &affinityAnalysis,
                              IREE::Stream::AsyncDispatchOp dispatchOp) {
  Builder builder(dispatchOp.getContext());
  ArrayAttr unknownAffinities = builder.getArrayAttr({});
  SmallVector<Attribute> affinityAttrs;
  for (auto resource : dispatchOp.getResourceOperands()) {
    // Only operands with affinity semantics participate.
    if (!isa<IREE::Stream::AffinityTypeInterface>(resource.getType())) {
      continue;
    }
    SmallVector<IREE::Stream::AffinityAttr> affinities;
    if (affinityAnalysis.tryLookupResourceAffinity(resource, affinities)) {
      affinityAttrs.push_back(
          builder.getArrayAttr(llvm::to_vector_of<Attribute>(affinities)));
    } else {
      // Analysis failure is encoded as an empty array attribute.
      affinityAttrs.push_back(unknownAffinities);
    }
  }
  return affinityAttrs;
}

/// Duplicates stream.executables based on the affinity analysis of
/// stream.async.dispatch ops. Some executables can be launched by different
/// devices. This can produce wrong codegen artifacts when binding types are
/// encoded (i.e., the tensor type has an encoding attribute), because
/// different devices may resolve the encodings to different layouts. E.g.,
/// say that device_a and device_b interpret a tensor type with encodings in
/// different layouts, and there is an executable that can be launched with
/// resources from either device_a or device_b. The input layouts for the
/// executable are ambiguous because there are two possibilities. In this
/// case, we have to duplicate the executable with updated encodings, and
/// modify each dispatch to launch the proper executable based on the device
/// analysis.
///
/// The function proceeds in three phases:
///   - Gather, for every export, the set of (execution affinity, resource
///     operand affinities) pairs over all dispatch sites that target it.
///   - Clone each executable once per additional unique pair so that every
///     (execution affinity, resource affinities, export) combination maps to
///     a dedicated executable.
///   - Repoint each dispatch entry point at the executable selected for its
///     affinities.
static LogicalResult duplicateExecutablesPerAffinityVariant(
    ModuleOp moduleOp, SymbolTable symbolTable, FunctionOpInterface funcOp,
    IREE::Stream::ResolveLayoutAttrFn resolveLayoutAttr) {
  MLIRContext *ctx = moduleOp.getContext();
  IRRewriter rewriter(ctx);

  // Gather the per-export [execution affinity -> [resource affinities]] map.
  IREE::Stream::AffinityAnalysis affinityAnalysis(moduleOp);
  if (failed(affinityAnalysis.run())) {
    return moduleOp.emitError("failed on running affinity analysis");
  }
  SmallVector<IREE::Stream::AsyncDispatchOp> candidates;
  funcOp.walk(
      [&](IREE::Stream::AsyncDispatchOp op) { candidates.push_back(op); });

  // export -> [affinity -> array per resource of affinities PVS].
  DenseMap<IREE::Stream::ExecutableExportOp,
           SetVector<std::pair<IREE::Stream::AffinityAttr, ArrayAttr>>>
      exportToDispatchSites;

  // Cache of per-dispatch resource operand affinities, reused when updating
  // dispatch sites below.
  llvm::MapVector<IREE::Stream::AsyncDispatchOp, SmallVector<Attribute>>
      resourceAffinities;
  for (auto dispatchOp : candidates) {
    SmallVector<IREE::Stream::AffinityAttr> execAffinities;
    if (!affinityAnalysis.tryLookupExecutionAffinity(dispatchOp,
                                                     execAffinities)) {
      return dispatchOp.emitError("failed on execution affinity lookup");
    }
    assert(execAffinities.size() == 1 &&
           "We should only have a single execution "
           "affinity when running the pass.");

    SmallVector<Attribute> operandAffinityAttrs =
        getResourceOperandsAffinities(affinityAnalysis, dispatchOp);
    resourceAffinities[dispatchOp] = operandAffinityAttrs;

    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
      exportToDispatchSites[exportOp].insert(std::make_pair(
          execAffinities[0], rewriter.getArrayAttr(operandAffinityAttrs)));
    });
  }

  LLVM_DEBUG({
    llvm::dbgs() << "Dump of exportToDispatchSites\n";
    for (auto [exportOp, affinities] : exportToDispatchSites) {
      llvm::dbgs() << "  ExportOp: " << exportOp.getSymName() << "\n";
      for (auto [execAffinity, resourceAffinities] : affinities) {
        llvm::dbgs() << "    execution affinity: " << execAffinity << "\n";
        llvm::dbgs() << "    resource affinities: " << resourceAffinities
                     << "\n";
      }
    }
  });

  // Duplicate executables for each unique set of resource affinities,
  // recording the executable chosen for each dispatch site.

  // Mapping from [execution affinity, resource operands affinities, export] to
  // the executable op.
  using DispatchSiteInfo = std::tuple<IREE::Stream::AffinityAttr, ArrayAttr,
                                      IREE::Stream::ExecutableExportOp>;
  DenseMap<DispatchSiteInfo, IREE::Stream::ExecutableOp>
      dispatchSiteToExecutableOp;
  for (auto [exportOp, execAndResourceAffinities] : exportToDispatchSites) {
    auto executableOp = exportOp->getParentOfType<IREE::Stream::ExecutableOp>();
    // No need to duplicate the executable if all the uses have the same
    // affinities.
    // TODO(hanchung): Do not duplicate the executables if bindings are not
    // encoded. I.e., all the tensor types do not have encodings.
    if (execAndResourceAffinities.size() == 1) {
      auto [execAffinity, resourceAffinities] = execAndResourceAffinities[0];
      dispatchSiteToExecutableOp[DispatchSiteInfo(
          execAffinity, resourceAffinities, exportOp)] = executableOp;
      continue;
    }

    // The first variant reuses the original executable (dupId == -1); each
    // subsequent variant clones it under a "<name>_dup<N>" symbol.
    int64_t dupId = -1;
    for (auto [execAffinity, resourceAffinities] : execAndResourceAffinities) {
      rewriter.setInsertionPointAfter(executableOp);
      IREE::Stream::ExecutableOp dupOp = executableOp;
      if (dupId != -1) {
        auto symName = std::string(executableOp.getSymName());
        symName += "_dup" + std::to_string(dupId);
        dupOp = rewriter.cloneWithoutRegions(executableOp);
        rewriter.modifyOpInPlace(dupOp, [&] {
          dupOp.setSymName(symName);
          IRMapping mapping;
          executableOp.getRegion().cloneInto(&dupOp.getRegion(), mapping);
        });
      }
      dispatchSiteToExecutableOp[DispatchSiteInfo(
          execAffinity, resourceAffinities, exportOp)] = dupOp;
      dupId++;
    }
  }

  // Update dispatch sites, i.e., point dispatch entry points to corresponding
  // cloned executables.
  for (auto dispatchOp : candidates) {
    SmallVector<Attribute> newEntryPoints;
    SmallVector<IREE::Stream::AffinityAttr> execAffinities;
    // Sanity check. The lookup should already succeed because it was checked
    // while gathering dispatch sites above. It cannot be wrapped in an
    // assertion because the call would be dropped in builds that compile
    // assertions out.
    if (!affinityAnalysis.tryLookupExecutionAffinity(dispatchOp,
                                                     execAffinities)) {
      return failure();
    }

    assert(execAffinities.size() == 1);
    SmallVector<Attribute> operandAttrs = resourceAffinities[dispatchOp];
    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
      auto info = DispatchSiteInfo(
          execAffinities[0], rewriter.getArrayAttr(operandAttrs), exportOp);
      assert(dispatchSiteToExecutableOp.count(info));

      auto executableOp = dispatchSiteToExecutableOp[info];
      auto newSym = SymbolRefAttr::get(executableOp->getAttrOfType<StringAttr>(
                                           SymbolTable::getSymbolAttrName()),
                                       entryPoint.getNestedReferences());
      newEntryPoints.push_back(newSym);
    });

    rewriter.modifyOpInPlace(dispatchOp, [&] {
      dispatchOp.setEntryPointsAttr(rewriter.getArrayAttr(newEntryPoints));
    });
  }

  // TODO(hanchung): Update encodings in executables.

  return success();
}

// TODO(hanchung): Add "cloneWithEncoding" method to RankedTensorType.
static RankedTensorType cloneWithEncoding(RankedTensorType type,
Attribute encodingAttr) {
Expand Down Expand Up @@ -156,6 +330,7 @@ struct SpecializeEncodingsPass
return signalPassFailure();
}

SymbolTable symbolTable(moduleOp);
llvm::MapVector<StringRef, IREE::Stream::ExecutableOp> executableOps;
for (auto executableOp : moduleOp.getOps<IREE::Stream::ExecutableOp>()) {
executableOps[executableOp.getName()] = executableOp;
Expand All @@ -171,7 +346,11 @@ struct SpecializeEncodingsPass
return signalPassFailure();
}

// TODO(hanchung): Duplicate executables and update dispatch ops.
if (failed(duplicateExecutablesPerAffinityVariant(
moduleOp, symbolTable, funcOp, resolveLayoutAttr))) {
funcOp.emitError("failed on executable duplication");
return signalPassFailure();
}
}
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,48 @@ module {
// CHECK: %[[D0_RES:.+]] = stream.tensor.sizeof {{.+}} tensor<?x?xf32, #[[$ENCODING0]]>
// CHECK: %[[D1_RES:.+]] = stream.tensor.sizeof {{.+}} tensor<?x?xf32, #[[$ENCODING1]]>
// CHECK: return %[[D0_RES]], %[[D1_RES]]

// -----

// Tests that an executable whose export is dispatched with resources from two
// distinct device affinities (@device_a and @device_b) is handled: the same
// @ex::@dispatch entry point is launched once on each device, so the pass is
// expected to duplicate the executable per affinity variant (see the
// expectations below).
#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>
#map = affine_map<(d0) -> (d0)>
#device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
#device_target_local_1_ = #hal.device.target<"local", {ordinal = 1 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@device_a>} {
  util.global private @device_a = #device_target_local_0_
  util.global private @device_b = #device_target_local_1_
  stream.executable private @ex {
    stream.executable.export public @dispatch
  }
  util.func public @multi_device(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view {
    %c16 = arith.constant 16 : index
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c4]) type(%element_type_f32) encoding(%dense_row_major)
    %0 = stream.tensor.import on(#hal.device.affinity<@device_a>) %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
    %1 = stream.timepoint.import on(#hal.device.affinity<@device_a>) %arg1 : (!hal.fence) => !stream.timepoint
    %2 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c16}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
    // First dispatch executes on @device_a...
    %4 = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@dispatch(%3[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
    %5 = stream.async.transfer %4 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_b>) !stream.resource<*>{%c16}
    // ...second dispatch of the same entry point executes on @device_b.
    %6 = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex::@dispatch(%5[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_b>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
    %result, %result_timepoint = stream.timepoint.barrier on(#hal.device.affinity<@device_a>) %7 : !stream.resource<*>{%c16} => !stream.timepoint
    stream.timepoint.chain_external on(#hal.device.affinity<@device_a>) %result_timepoint => (%arg2 : !hal.fence)
    %8 = stream.async.transfer %result : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<external>{%c16}
    %9 = stream.tensor.export on(#hal.device.affinity<@device_a>) %8 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
    util.return %9 : !hal.buffer_view
  }
}

// CHECK: #[[DEVICE_LOCAL_0:.+]] = #hal.device.target
// CHECK: #[[DEVICE_LOCAL_1:.+]] = #hal.device.target
// CHECK: util.global private @[[$DEVICE_A:.+]] = #[[DEVICE_LOCAL_0]]
// CHECK: util.global private @[[$DEVICE_B:.+]] = #[[DEVICE_LOCAL_1]]
// CHECK: stream.executable private @[[$EX0:.+]] {
// CHECK: stream.executable private @[[$EX1:.+]] {
// CHECK-LABEL: util.func public @multi_device
// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_A]]>) @[[$EX0]]::@dispatch
// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_B]]>) @[[$EX1]]::@dispatch
Loading