Commit bd955e2

Narrow for loops too
1 parent 1068d83

3 files changed: +111 -11

compiler/src/iree/compiler/Codegen/Transforms/Transforms.cpp (+19 -6)

@@ -506,17 +506,30 @@ void moveLoopInvariantCodeFromGuaranteedLoops(Operation *target) {
         loopLike.getLoopLowerBounds();
     std::optional<SmallVector<OpFoldResult>> maybeUpperBounds =
         loopLike.getLoopUpperBounds();
-    if (!maybeLowerBounds || !maybeUpperBounds) {
+    std::optional<SmallVector<Value>> maybeIvs =
+        loopLike.getLoopInductionVars();
+    if (!maybeLowerBounds || !maybeUpperBounds || !maybeIvs) {
       return;
     }
 
     // If any lower + upper bound pair cannot be definitely verified as lb < ub
     // then the loop may have a zero trip count.
-    for (auto [lb, ub] :
-         llvm::zip_equal(*maybeLowerBounds, *maybeUpperBounds)) {
-      if (!ValueBoundsConstraintSet::compare(lb, ValueBoundsConstraintSet::LT,
-                                             ub)) {
-        return;
+    for (auto [lb, ub, iv] :
+         llvm::zip_equal(*maybeLowerBounds, *maybeUpperBounds, *maybeIvs)) {
+      if (iv.getType().isIndex()) {
+        if (!ValueBoundsConstraintSet::compare(lb, ValueBoundsConstraintSet::LT,
+                                               ub)) {
+          return;
+        }
+      } else {
+        // Weaker test for non-`index` operands to some loops like scf.for,
+        // since the value bounds interface requires index types.
+        auto maybeLb = getConstantIntValue(lb);
+        auto maybeUb = getConstantIntValue(ub);
+        if (!maybeLb || !maybeUb)
+          return;
+        if (*maybeLb >= *maybeUb)
+          return;
       }
     }

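To illustrate the new fallback branch: a minimal hypothetical loop (the function and operand names are invented here, not taken from the commit) whose i64 induction variable keeps ValueBoundsConstraintSet from comparing the bounds, but whose constant bounds still prove 0 < 4, so the loop is guaranteed to execute and the invariant load may be hoisted:

  util.func @hoist_from_i64_loop(%src: memref<f32>, %dst: memref<4xf32>) {
    %c0 = arith.constant 0 : i64
    %c4 = arith.constant 4 : i64
    %c1 = arith.constant 1 : i64
    scf.for %i = %c0 to %c4 step %c1 : i64 {
      // Loop-invariant load: safe to hoist once the trip count is known >= 1.
      %v = memref.load %src[] : memref<f32>
      %idx = arith.index_castui %i : i64 to index
      memref.store %v, %dst[%idx] : memref<4xf32>
    }
    util.return
  }
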
compiler/src/iree/compiler/Dialect/Util/Transforms/OptimizeIntArithmetic.cpp (+63 -1)

@@ -18,6 +18,7 @@
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
@@ -254,6 +255,66 @@ struct RemoveIndexCastForAssumeOfI32
   DataFlowSolver &solver;
 };
 
+//===----------------------------------------------------------------------===//
+// scf.for induction variable range narrowing
+// If the induction variable of an scf.for can be represented as an I32,
+// make that change to save on registers etc.
+//===----------------------------------------------------------------------===//
+struct NarrowSCFForIvToI32 : public OpRewritePattern<scf::ForOp> {
+  NarrowSCFForIvToI32(MLIRContext *context, DataFlowSolver &solver)
+      : OpRewritePattern(context), solver(solver) {}
+
+  LogicalResult matchAndRewrite(scf::ForOp forOp,
+                                PatternRewriter &rewriter) const override {
+    Location loc = forOp.getLoc();
+    Value iv = forOp.getInductionVar();
+    Type srcType = iv.getType();
+    if (!srcType.isIndex() && !srcType.isInteger(64))
+      return rewriter.notifyMatchFailure(forOp, "IV isn't an index or i64");
+    if (!staticallyLegalToConvertToUnsigned(solver, iv))
+      return rewriter.notifyMatchFailure(forOp, "IV isn't non-negative");
+    if (!staticallyLegalToConvertToUnsigned(solver, forOp.getStep()))
+      return rewriter.notifyMatchFailure(forOp, "Step isn't non-negative");
+    auto *ivState = solver.lookupState<IntegerValueRangeLattice>(iv);
+    if (ivState->getValue().getValue().smax().getActiveBits() > 31)
+      return rewriter.notifyMatchFailure(forOp, "IV won't fit in signed int32");
+
+    Type i32 = rewriter.getI32Type();
+    auto doCastDown = [&](Value v) -> Value {
+      if (srcType.isIndex())
+        return rewriter.create<arith::IndexCastUIOp>(loc, i32, v);
+      else
+        return rewriter.create<arith::TruncIOp>(loc, i32, v);
+    };
+    Value newLb = doCastDown(forOp.getLowerBound());
+    Value newUb = doCastDown(forOp.getUpperBound());
+    Value newStep = doCastDown(forOp.getStep());
+    {
+      PatternRewriter::InsertionGuard g(rewriter);
+      rewriter.setInsertionPointToStart(&forOp.getRegion().front());
+      Value castBackOp;
+      if (srcType.isIndex())
+        castBackOp =
+            rewriter.create<arith::IndexCastUIOp>(iv.getLoc(), srcType, iv);
+      else
+        castBackOp = rewriter.create<arith::ExtUIOp>(iv.getLoc(), srcType, iv);
+      (void)solver.getOrCreateState<IntegerValueRangeLattice>(castBackOp)
+          ->join(*ivState);
+      rewriter.replaceAllUsesExcept(iv, castBackOp, castBackOp.getDefiningOp());
+    }
+    solver.eraseState(iv);
+    rewriter.modifyOpInPlace(forOp, [&]() {
+      iv.setType(i32);
+      forOp.getLowerBoundMutable().assign(newLb);
+      forOp.getUpperBoundMutable().assign(newUb);
+      forOp.getStepMutable().assign(newStep);
+    });
+    return success();
+  }
+
+  DataFlowSolver &solver;
+};
+
 //===----------------------------------------------------------------------===//
 // Divisibility
 //===----------------------------------------------------------------------===//
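In effect, the pattern swaps the loop control over to i32 and reintroduces the original type at the top of the body, so existing uses stay valid and the surrounding narrowing patterns can later fold the casts into their users. A minimal before/after sketch, assuming index-typed control values that the solver has proven non-negative and below 2^31 (%lb, %ub, %step, and the "test.use" consumer are invented for illustration):

  // Before: `index`-typed induction variable.
  scf.for %i = %lb to %ub step %step {
    "test.use"(%i) : (index) -> ()
  }

  // After NarrowSCFForIvToI32: i32 loop control, IV cast back on entry.
  %lb_i32 = arith.index_castui %lb : index to i32
  %ub_i32 = arith.index_castui %ub : index to i32
  %step_i32 = arith.index_castui %step : index to i32
  scf.for %i = %lb_i32 to %ub_i32 step %step_i32 : i32 {
    %i_idx = arith.index_castui %i : i32 to index
    "test.use"(%i_idx) : (index) -> ()
  }

The join/eraseState bookkeeping carries the solver's range facts over to the cast-back value, so narrowing can continue through the loop body without re-running the analysis.
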
@@ -396,7 +457,8 @@ class OptimizeIntArithmeticPass
 
     if (narrowToI32) {
       arith::populateIntRangeNarrowingPatterns(patterns, solver, {32});
-      patterns.add<RemoveIndexCastForAssumeOfI32>(ctx, solver);
+      patterns.add<NarrowSCFForIvToI32, RemoveIndexCastForAssumeOfI32>(ctx,
+                                                                       solver);
     }
 
     // Populate canonicalization patterns.

compiler/src/iree/compiler/Dialect/Util/Transforms/test/optimize_int_arithmetic_narrowing.mlir (+29 -4)

@@ -13,8 +13,8 @@
 // CHECK-DAG: %[[TID_I32:.+]] = arith.index_castui %[[THREAD_ID_X]] : index to i32
 // CHECK: %[[V0:.+]] = arith.divui %[[TID_I32]], %[[C16]] : i32
 // CHECK-NEXT: %[[V1:.+]] = arith.remui %[[TID_I32]], %[[C16]] : i32
-// CHECK-NEXT: %[[V2:.+]] = arith.muli %[[V0]], %[[V32]] : i32
-// CHECK-NEXT; %[[V3:.+]] = arith.addi %[[V2]], %[[V1]] : i32
+// CHECK-NEXT: %[[V2:.+]] = arith.muli %[[V0]], %[[C32]] : i32
+// CHECK-NEXT: %[[V3:.+]] = arith.addi %[[V2]], %[[V1]] : i32
 // CHECK-NEXT: %[[RET:.+]] = arith.index_castui %[[V3]] : i32 to index
 // CHECK: return %[[RET]]
 util.func @narrow_tid_computations() -> index {
@@ -32,12 +32,37 @@ util.func @narrow_tid_computations() -> index {
 
 // CHECK-LABEL: @narrow_assumes
 // CHECK-SAME: (%[[ARG0:.+]]: i32)
-// CHECK-NEXT: %[[ASSUME:.+]] = util.assume.int %[[ARG0]][<umin = 16, umax = 122, udiv = 16>] : i32
+// CHECK-NEXT: %[[ASSUME:.+]] = util.assume.int %[[ARG0]]<umin = 16, umax = 122, udiv = 16> : i32
 // CHECK-NEXT: %[[AS_INDEX:.+]] = arith.index_castui %[[ASSUME]] : i32 to index
 // CHECK-NEXT: util.return %[[ASSUME]], %[[AS_INDEX]]
 util.func @narrow_assumes(%arg0: i32) -> (i32, index) {
   %0 = arith.index_castui %arg0 : i32 to index
-  %1 = util.assume.int %0[<umin = 16, umax = 122, udiv = 16>] : index
+  %1 = util.assume.int %0<umin = 16, umax = 122, udiv = 16> : index
   %2 = arith.index_castui %1 : index to i32
   util.return %2, %1 : i32, index
 }
+
+// -----
+
+// CHECK-LABEL: @narrow_scf_for
+// CHECK-DAG: %[[C64:.+]] = arith.constant 64 : i32
+// CHECK-DAG: %[[C96:.+]] = arith.constant 96 : i32
+// CHECK-DAG: %[[C512:.+]] = arith.constant 512 : i32
+// CHECK-DAG: %[[TID:.+]] = gpu.thread_id x upper_bound 64
+// CHECK-DAG: %[[TID_I32:.+]] = arith.index_castui %[[TID]] : index to i32
+// CHECK: scf.for %[[ARG1:.+]] = %[[TID_I32]] to %[[C96]] step %[[C64]]
+// CHECK-NEXT: %[[V0:.+]] = arith.addi %[[ARG1]], %[[C512]]
+// CHECK-NEXT: %[[V0_IDX:.+]] = arith.index_castui %[[V0]] : i32 to index
+// CHECK-NEXT: memref.store {{.*}}[%[[V0_IDX]]]
+util.func @narrow_scf_for(%arg0: memref<?xf32>) {
+  %c0_f32 = arith.constant 0.0 : f32
+  %c64 = arith.constant 64 : index
+  %c96 = arith.constant 96 : index
+  %c512 = arith.constant 512 : index
+  %tid = gpu.thread_id x upper_bound 64
+  scf.for %arg1 = %tid to %c96 step %c64 {
+    %0 = arith.addi %arg1, %c512 : index
+    memref.store %c0_f32, %arg0[%0] : memref<?xf32>
+  }
+  util.return
+}
