Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for executable duplication in encoding specialization pass. #19527

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/LogicalResult.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
Expand Down Expand Up @@ -52,6 +53,179 @@ SmallVector<const T *> gatherUsedDialectInterfaces(mlir::ModuleOp moduleOp) {
return results;
}

/// Returns one array attribute per affinity-typed resource operand of
/// `dispatchOp`, holding the affinities resolved by `affinityAnalysis` for
/// that operand. An empty array attribute marks an operand whose affinity
/// could not be resolved (i.e., the affinity analysis failed for it).
/// Operands whose type does not implement AffinityTypeInterface are skipped
/// entirely, so the result may have fewer entries than the operand list.
static SmallVector<Attribute>
getResourceOperandsAffinities(IREE::Stream::AffinityAnalysis &affinityAnalysis,
                              IREE::Stream::AsyncDispatchOp dispatchOp) {
  Builder builder(dispatchOp.getContext());
  ArrayAttr unknownAffinities = builder.getArrayAttr({});
  SmallVector<Attribute> affinityAttrs;
  for (auto resource : dispatchOp.getResourceOperands()) {
    // Only operands with affinity semantics participate.
    if (!isa<IREE::Stream::AffinityTypeInterface>(resource.getType())) {
      continue;
    }
    SmallVector<IREE::Stream::AffinityAttr> affinities;
    if (affinityAnalysis.tryLookupResourceAffinity(resource, affinities)) {
      affinityAttrs.push_back(
          builder.getArrayAttr(llvm::to_vector_of<Attribute>(affinities)));
    } else {
      // Analysis failure is encoded as an empty array attribute.
      affinityAttrs.push_back(unknownAffinities);
    }
  }
  return affinityAttrs;
}

/// Duplicates stream.executables based on the affinity analysis of
/// stream.async.dispatch ops. Some executables can be launched by different
/// devices. This can produce wrong codegen artifacts when binding types are
/// encoded (i.e., the tensor type has an encoding attribute), because
/// different devices may resolve the encodings to different layouts. E.g.,
/// say that device_a and device_b interpret a tensor type with encodings in
/// different layouts, and there is an executable that can be launched with
/// resources from either device_a or device_b. The input layouts for the
/// executable are ambiguous because there are two possibilities. In this
/// case, we have to duplicate the executable with updated encodings, and
/// modify each dispatch to launch the proper executable based on the device
/// analysis.
///
/// The function proceeds in three phases:
///   - Gather, for every export, the set of (execution affinity, resource
///     operand affinities) pairs over all dispatch sites that target it.
///   - Clone each executable once per additional unique pair so that every
///     (execution affinity, resource affinities, export) combination maps to
///     a dedicated executable.
///   - Repoint each dispatch entry point at the executable selected for its
///     affinities.
static LogicalResult duplicateExecutablesPerAffinityVariant(
    ModuleOp moduleOp, SymbolTable symbolTable, FunctionOpInterface funcOp,
    IREE::Stream::ResolveLayoutAttrFn resolveLayoutAttr) {
  MLIRContext *ctx = moduleOp.getContext();
  IRRewriter rewriter(ctx);

  // Gather the per-export [execution affinity -> [resource affinities]] map.
  IREE::Stream::AffinityAnalysis affinityAnalysis(moduleOp);
  if (failed(affinityAnalysis.run())) {
    return moduleOp.emitError("failed on running affinity analysis");
  }
  SmallVector<IREE::Stream::AsyncDispatchOp> candidates;
  funcOp.walk(
      [&](IREE::Stream::AsyncDispatchOp op) { candidates.push_back(op); });

  // export -> [affinity -> array per resource of affinities PVS].
  DenseMap<IREE::Stream::ExecutableExportOp,
           SetVector<std::pair<IREE::Stream::AffinityAttr, ArrayAttr>>>
      exportToDispatchSites;

  // Cache of per-dispatch resource operand affinities, reused when updating
  // dispatch sites below.
  llvm::MapVector<IREE::Stream::AsyncDispatchOp, SmallVector<Attribute>>
      resourceAffinities;
  for (auto dispatchOp : candidates) {
    SmallVector<IREE::Stream::AffinityAttr> execAffinities;
    if (!affinityAnalysis.tryLookupExecutionAffinity(dispatchOp,
                                                     execAffinities)) {
      return dispatchOp.emitError("failed on execution affinity lookup");
    }
    assert(execAffinities.size() == 1 &&
           "We should only have a single execution "
           "affinity when running the pass.");

    SmallVector<Attribute> operandAffinityAttrs =
        getResourceOperandsAffinities(affinityAnalysis, dispatchOp);
    resourceAffinities[dispatchOp] = operandAffinityAttrs;

    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
      exportToDispatchSites[exportOp].insert(std::make_pair(
          execAffinities[0], rewriter.getArrayAttr(operandAffinityAttrs)));
    });
  }

  LLVM_DEBUG({
    llvm::dbgs() << "Dump of exportToDispatchSites\n";
    for (auto [exportOp, affinities] : exportToDispatchSites) {
      llvm::dbgs() << "  ExportOp: " << exportOp.getSymName() << "\n";
      for (auto [execAffinity, resourceAffinities] : affinities) {
        llvm::dbgs() << "    execution affinity: " << execAffinity << "\n";
        llvm::dbgs() << "    resource affinities: " << resourceAffinities
                     << "\n";
      }
    }
  });

  // Duplicate executables for each unique set of resource affinities,
  // recording the executable chosen for each dispatch site.

  // Mapping from [execution affinity, resource operands affinities, export] to
  // the executable op.
  using DispatchSiteInfo = std::tuple<IREE::Stream::AffinityAttr, ArrayAttr,
                                      IREE::Stream::ExecutableExportOp>;
  DenseMap<DispatchSiteInfo, IREE::Stream::ExecutableOp>
      dispatchSiteToExecutableOp;
  for (auto [exportOp, execAndResourceAffinities] : exportToDispatchSites) {
    auto executableOp = exportOp->getParentOfType<IREE::Stream::ExecutableOp>();
    // No need to duplicate the executable if all the uses have the same
    // affinities.
    // TODO(hanchung): Do not duplicate the executables if bindings are not
    // encoded. I.e., all the tensor types do not have encodings.
    if (execAndResourceAffinities.size() == 1) {
      auto [execAffinity, resourceAffinities] = execAndResourceAffinities[0];
      dispatchSiteToExecutableOp[DispatchSiteInfo(
          execAffinity, resourceAffinities, exportOp)] = executableOp;
      continue;
    }

    // The first variant reuses the original executable (dupId == -1); each
    // subsequent variant clones it under a "<name>_dup<N>" symbol.
    int64_t dupId = -1;
    for (auto [execAffinity, resourceAffinities] : execAndResourceAffinities) {
      rewriter.setInsertionPointAfter(executableOp);
      IREE::Stream::ExecutableOp dupOp = executableOp;
      if (dupId != -1) {
        auto symName = std::string(executableOp.getSymName());
        symName += "_dup" + std::to_string(dupId);
        dupOp = rewriter.cloneWithoutRegions(executableOp);
        rewriter.modifyOpInPlace(dupOp, [&] {
          dupOp.setSymName(symName);
          IRMapping mapping;
          executableOp.getRegion().cloneInto(&dupOp.getRegion(), mapping);
        });
      }
      dispatchSiteToExecutableOp[DispatchSiteInfo(
          execAffinity, resourceAffinities, exportOp)] = dupOp;
      dupId++;
    }
  }

  // Update dispatch sites, i.e., point dispatch entry points to corresponding
  // cloned executables.
  for (auto dispatchOp : candidates) {
    SmallVector<Attribute> newEntryPoints;
    SmallVector<IREE::Stream::AffinityAttr> execAffinities;
    // Sanity check. The lookup should already succeed because it was checked
    // while gathering dispatch sites above. It cannot be wrapped in an
    // assertion because the call would be dropped in builds that compile
    // assertions out.
    if (!affinityAnalysis.tryLookupExecutionAffinity(dispatchOp,
                                                     execAffinities)) {
      return failure();
    }

    assert(execAffinities.size() == 1);
    SmallVector<Attribute> operandAttrs = resourceAffinities[dispatchOp];
    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
      auto info = DispatchSiteInfo(
          execAffinities[0], rewriter.getArrayAttr(operandAttrs), exportOp);
      assert(dispatchSiteToExecutableOp.count(info));

      auto executableOp = dispatchSiteToExecutableOp[info];
      auto newSym = SymbolRefAttr::get(executableOp->getAttrOfType<StringAttr>(
                                           SymbolTable::getSymbolAttrName()),
                                       entryPoint.getNestedReferences());
      newEntryPoints.push_back(newSym);
    });

    rewriter.modifyOpInPlace(dispatchOp, [&] {
      dispatchOp.setEntryPointsAttr(rewriter.getArrayAttr(newEntryPoints));
    });
  }

  // TODO(hanchung): Update encodings in executables.

  return success();
}

// TODO(hanchung): Add "cloneWithEncoding" method to RankedTensorType.
static RankedTensorType cloneWithEncoding(RankedTensorType type,
Attribute encodingAttr) {
Expand Down Expand Up @@ -156,6 +330,7 @@ struct SpecializeEncodingsPass
return signalPassFailure();
}

SymbolTable symbolTable(moduleOp);
llvm::MapVector<StringRef, IREE::Stream::ExecutableOp> executableOps;
for (auto executableOp : moduleOp.getOps<IREE::Stream::ExecutableOp>()) {
executableOps[executableOp.getName()] = executableOp;
Expand All @@ -171,7 +346,11 @@ struct SpecializeEncodingsPass
return signalPassFailure();
}

// TODO(hanchung): Duplicate executables and update dispatch ops.
if (failed(duplicateExecutablesPerAffinityVariant(
moduleOp, symbolTable, funcOp, resolveLayoutAttr))) {
funcOp.emitError("failed on executable duplication");
return signalPassFailure();
}
}
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,48 @@ module {
// CHECK: %[[D0_RES:.+]] = stream.tensor.sizeof {{.+}} tensor<?x?xf32, #[[$ENCODING0]]>
// CHECK: %[[D1_RES:.+]] = stream.tensor.sizeof {{.+}} tensor<?x?xf32, #[[$ENCODING1]]>
// CHECK: return %[[D0_RES]], %[[D1_RES]]

// -----

// Tests that an executable whose export is dispatched with resources from two
// distinct device affinities (@device_a and @device_b) is handled: the same
// @ex::@dispatch entry point is launched once on each device, so the pass is
// expected to duplicate the executable per affinity variant (see the
// expectations below).
#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>
#map = affine_map<(d0) -> (d0)>
#device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
#device_target_local_1_ = #hal.device.target<"local", {ordinal = 1 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@device_a>} {
  util.global private @device_a = #device_target_local_0_
  util.global private @device_b = #device_target_local_1_
  stream.executable private @ex {
    stream.executable.export public @dispatch
  }
  util.func public @multi_device(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view {
    %c16 = arith.constant 16 : index
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c4]) type(%element_type_f32) encoding(%dense_row_major)
    %0 = stream.tensor.import on(#hal.device.affinity<@device_a>) %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
    %1 = stream.timepoint.import on(#hal.device.affinity<@device_a>) %arg1 : (!hal.fence) => !stream.timepoint
    %2 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c16}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
    // First dispatch executes on @device_a...
    %4 = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@dispatch(%3[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
    %5 = stream.async.transfer %4 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_b>) !stream.resource<*>{%c16}
    // ...second dispatch of the same entry point executes on @device_b.
    %6 = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex::@dispatch(%5[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_b>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
    %result, %result_timepoint = stream.timepoint.barrier on(#hal.device.affinity<@device_a>) %7 : !stream.resource<*>{%c16} => !stream.timepoint
    stream.timepoint.chain_external on(#hal.device.affinity<@device_a>) %result_timepoint => (%arg2 : !hal.fence)
    %8 = stream.async.transfer %result : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<external>{%c16}
    %9 = stream.tensor.export on(#hal.device.affinity<@device_a>) %8 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
    util.return %9 : !hal.buffer_view
  }
}

// CHECK: #[[DEVICE_LOCAL_0:.+]] = #hal.device.target
// CHECK: #[[DEVICE_LOCAL_1:.+]] = #hal.device.target
// CHECK: util.global private @[[$DEVICE_A:.+]] = #[[DEVICE_LOCAL_0]]
// CHECK: util.global private @[[$DEVICE_B:.+]] = #[[DEVICE_LOCAL_1]]
// CHECK: stream.executable private @[[$EX0:.+]] {
// CHECK: stream.executable private @[[$EX1:.+]] {
// CHECK-LABEL: util.func public @multi_device
// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_A]]>) @[[$EX0]]::@dispatch
// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_B]]>) @[[$EX1]]::@dispatch
Loading