-
Notifications
You must be signed in to change notification settings - Fork 13k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Expand scmp/ucmp vector operations with sub #108830
Conversation
Unlike scalar, where AArch64 prefers expanding scmp/ucmp with select, under Neon we can use the arithmetic expansion to generate fewer instructions. Notably it also prevents the scalarization of vselect during vector-legalization.
@llvm/pr-subscribers-backend-systemz @llvm/pr-subscribers-llvm-selectiondag Author: David Green (davemgreen) Changes: Unlike scalar, where AArch64 prefers expanding scmp/ucmp with select, under Neon we can use the arithmetic expansion to generate fewer instructions. Notably it also prevents the scalarization of vselect during vector-legalization. Patch is 32.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/108830.diff 9 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 50dc7d5c54c54a..caa3a57ebabc2e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2451,7 +2451,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
CmpIntrinsic::getLTPredicate(IID),
CostKind);
- if (TLI->shouldExpandCmpUsingSelects()) {
+ EVT VT = TLI->getValueType(DL, CmpTy, true);
+ if (TLI->shouldExpandCmpUsingSelects(VT)) {
// x < y ? -1 : (x > y ? 1 : 0)
Cost += 2 * thisT()->getCmpSelInstrCost(
BinaryOperator::Select, RetTy, CondTy,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e17d68d2690c86..802510dd0e4fa0 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3409,7 +3409,7 @@ class TargetLoweringBase {
/// Should we expand [US]CMP nodes using two selects and two compares, or by
/// doing arithmetic on boolean types
- virtual bool shouldExpandCmpUsingSelects() const { return false; }
+ virtual bool shouldExpandCmpUsingSelects(EVT VT) const { return false; }
/// Does this target support complex deinterleaving
virtual bool isComplexDeinterleavingSupported() const { return false; }
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index ca379a691da918..95937886280685 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10681,7 +10681,7 @@ SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const {
// because one of the conditions can be merged with one of the selects.
// And finally, if we don't know the contents of high bits of a boolean value
// we can't perform any arithmetic either.
- if (shouldExpandCmpUsingSelects() || BoolVT.getScalarSizeInBits() == 1 ||
+ if (shouldExpandCmpUsingSelects(VT) || BoolVT.getScalarSizeInBits() == 1 ||
getBooleanContents(BoolVT) == UndefinedBooleanContent) {
SDValue SelectZeroOrOne =
DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 47da9d577cd827..d41f45ac0ce823 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27781,6 +27781,12 @@ bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
}
+bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
+ // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
+ // avoid vselect becoming bsl / unrolling.
+ return !VT.isFixedLengthVector();
+}
+
MachineInstr *
AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
MachineBasicBlock::instr_iterator &MBBI,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index f9d45b02d30e30..06b918f9ccaa28 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -914,7 +914,7 @@ class AArch64TargetLowering : public TargetLowering {
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
- bool shouldExpandCmpUsingSelects() const override { return true; }
+ bool shouldExpandCmpUsingSelects(EVT VT) const override;
bool isComplexDeinterleavingSupported() const override;
bool isComplexDeinterleavingOperationSupported(
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 1e7285e3e0fc53..4a18bde00a0b98 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -507,7 +507,7 @@ class SystemZTargetLowering : public TargetLowering {
bool shouldConsiderGEPOffsetSplit() const override { return true; }
- bool shouldExpandCmpUsingSelects() const override { return true; }
+ bool shouldExpandCmpUsingSelects(EVT VT) const override { return true; }
const char *getTargetNodeName(unsigned Opcode) const override;
std::pair<unsigned, const TargetRegisterClass *>
diff --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
index 1b4b5eb616b5a9..a56ca8890e307b 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -128,16 +128,16 @@ define void @uscmp() {
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u16 = call i16 @llvm.ucmp.i16.i16(i16 undef, i16 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u32 = call i32 @llvm.ucmp.i32.i32(i32 undef, i32 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u64 = call i64 @llvm.ucmp.i64.i64(i64 undef, i64 undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8 = call i8 @llvm.scmp.i8.i8(i8 undef, i8 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16 = call i16 @llvm.scmp.i16.i16(i16 undef, i16 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s32 = call i32 @llvm.scmp.i32.i32(i32 undef, i32 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s64 = call i64 @llvm.scmp.i64.i64(i64 undef, i64 undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-SIZE-LABEL: 'uscmp'
@@ -145,16 +145,16 @@ define void @uscmp() {
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u16 = call i16 @llvm.ucmp.i16.i16(i16 undef, i16 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u32 = call i32 @llvm.ucmp.i32.i32(i32 undef, i32 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u64 = call i64 @llvm.ucmp.i64.i64(i64 undef, i64 undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8 = call i8 @llvm.scmp.i8.i8(i8 undef, i8 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16 = call i16 @llvm.scmp.i16.i16(i16 undef, i16 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s32 = call i32 @llvm.scmp.i32.i32(i32 undef, i32 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s64 = call i64 @llvm.scmp.i64.i64(i64 undef, i64 undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%u8 = call i8 @llvm.ucmp(i8 undef, i8 undef)
diff --git a/llvm/test/CodeGen/AArch64/scmp.ll b/llvm/test/CodeGen/AArch64/scmp.ll
index 3d18a904ed2d3f..4aff5a836e1a18 100644
--- a/llvm/test/CodeGen/AArch64/scmp.ll
+++ b/llvm/test/CodeGen/AArch64/scmp.ll
@@ -136,11 +136,9 @@ define i64 @scmp.64.64(i64 %x, i64 %y) nounwind {
define <8 x i8> @s_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-LABEL: s_v8i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.8b, #1
-; CHECK-SD-NEXT: cmgt v3.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: cmgt v2.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: cmgt v0.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
-; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT: sub v0.8b, v0.8b, v2.8b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v8i8:
@@ -160,11 +158,9 @@ entry:
define <16 x i8> @s_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: s_v16i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.16b, #1
-; CHECK-SD-NEXT: cmgt v3.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: cmgt v2.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: cmgt v0.16b, v1.16b, v0.16b
-; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
-; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT: sub v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v16i8:
@@ -184,11 +180,9 @@ entry:
define <4 x i16> @s_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-SD-LABEL: s_v4i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.4h, #1
-; CHECK-SD-NEXT: cmgt v3.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: cmgt v2.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: cmgt v0.4h, v1.4h, v0.4h
-; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
-; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT: sub v0.4h, v0.4h, v2.4h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v4i16:
@@ -208,11 +202,9 @@ entry:
define <8 x i16> @s_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-SD-LABEL: s_v8i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.8h, #1
-; CHECK-SD-NEXT: cmgt v3.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: cmgt v2.8h, v0.8h, v1.8h
; CHECK-SD-NEXT: cmgt v0.8h, v1.8h, v0.8h
-; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
-; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT: sub v0.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v8i16:
@@ -232,15 +224,12 @@ entry:
define <16 x i16> @s_v16i16(<16 x i16> %a, <16 x i16> %b) {
; CHECK-SD-LABEL: s_v16i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v4.8h, #1
+; CHECK-SD-NEXT: cmgt v4.8h, v1.8h, v3.8h
; CHECK-SD-NEXT: cmgt v5.8h, v0.8h, v2.8h
-; CHECK-SD-NEXT: cmgt v6.8h, v1.8h, v3.8h
; CHECK-SD-NEXT: cmgt v0.8h, v2.8h, v0.8h
; CHECK-SD-NEXT: cmgt v1.8h, v3.8h, v1.8h
-; CHECK-SD-NEXT: and v2.16b, v5.16b, v4.16b
-; CHECK-SD-NEXT: and v3.16b, v6.16b, v4.16b
-; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
-; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT: sub v0.8h, v0.8h, v5.8h
+; CHECK-SD-NEXT: sub v1.8h, v1.8h, v4.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v16i16:
@@ -264,11 +253,9 @@ entry:
define <2 x i32> @s_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-SD-LABEL: s_v2i32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.2s, #1
-; CHECK-SD-NEXT: cmgt v3.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: cmgt v2.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: cmgt v0.2s, v1.2s, v0.2s
-; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
-; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT: sub v0.2s, v0.2s, v2.2s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v2i32:
@@ -288,11 +275,9 @@ entry:
define <4 x i32> @s_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: s_v4i32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.4s, #1
-; CHECK-SD-NEXT: cmgt v3.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: cmgt v2.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: cmgt v0.4s, v1.4s, v0.4s
-; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
-; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v4i32:
@@ -312,15 +297,12 @@ entry:
define <8 x i32> @s_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK-SD-LABEL: s_v8i32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v4.4s, #1
+; CHECK-SD-NEXT: cmgt v4.4s, v1.4s, v3.4s
; CHECK-SD-NEXT: cmgt v5.4s, v0.4s, v2.4s
-; CHECK-SD-NEXT: cmgt v6.4s, v1.4s, v3.4s
; CHECK-SD-NEXT: cmgt v0.4s, v2.4s, v0.4s
; CHECK-SD-NEXT: cmgt v1.4s, v3.4s, v1.4s
-; CHECK-SD-NEXT: and v2.16b, v5.16b, v4.16b
-; CHECK-SD-NEXT: and v3.16b, v6.16b, v4.16b
-; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
-; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT: sub v0.4s, v0.4s, v5.4s
+; CHECK-SD-NEXT: sub v1.4s, v1.4s, v4.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v8i32:
@@ -344,12 +326,9 @@ entry:
define <2 x i64> @s_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-SD-LABEL: s_v2i64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #1 // =0x1
; CHECK-SD-NEXT: cmgt v2.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: cmgt v0.2d, v1.2d, v0.2d
-; CHECK-SD-NEXT: dup v3.2d, x8
-; CHECK-SD-NEXT: and v1.16b, v2.16b, v3.16b
-; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT: sub v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v2i64:
@@ -370,16 +349,12 @@ entry:
define <4 x i64> @s_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK-SD-LABEL: s_v4i64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #1 // =0x1
-; CHECK-SD-NEXT: cmgt v4.2d, v0.2d, v2.2d
-; CHECK-SD-NEXT: cmgt v6.2d, v1.2d, v3.2d
-; CHECK-SD-NEXT: dup v5.2d, x8
+; CHECK-SD-NEXT: cmgt v4.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: cmgt v5.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: cmgt v0.2d, v2.2d, v0.2d
; CHECK-SD-NEXT: cmgt v1.2d, v3.2d, v1.2d
-; CHECK-SD-NEXT: and v2.16b, v4.16b, v5.16b
-; CHECK-SD-NEXT: and v3.16b, v6.16b, v5.16b
-; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
-; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT: sub v0.2d, v0.2d, v5.2d
+; CHECK-SD-NEXT: sub v1.2d, v1.2d, v4.2d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v4i64:
@@ -404,122 +379,13 @@ entry:
define <16 x i8> @signOf_neon_scmp(<8 x i16> %s0_lo, <8 x i16> %s0_hi, <8 x i16> %s1_lo, <8 x i16> %s1_hi) {
; CHECK-SD-LABEL: signOf_neon_scmp:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: cmgt v5.8h, v0.8h, v2.8h
-; CHECK-SD-NEXT: cmgt v2.8h, v2.8h, v0.8h
; CHECK-SD-NEXT: cmgt v4.8h, v1.8h, v3.8h
; CHECK-SD-NEXT: cmgt v1.8h, v3.8h, v1.8h
-; CHECK-SD-NEXT: umov w8, v5.h[1]
-; CHECK-SD-NEXT: umov w9, v2.h[1]
-; CHECK-SD-NEXT: umov w10, v5.h[0]
-; CHECK-SD-NEXT: umov w11, v2.h[0]
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: cset w8, ne
-; CHECK-SD-NEXT: tst w9, #0xffff
-; CHECK-SD-NEXT: csinv w8, w8, wzr, eq
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v5.h[2]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w11, #0xffff
-; CHECK-SD-NEXT: umov w11, v2.h[2]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v2.h[3]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w11, #0xffff
-; CHECK-SD-NEXT: mov v0.b[1], w8
-; CHECK-SD-NEXT: umov w8, v5.h[3]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[2], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v5.h[4]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v2.h[4]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[3], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v5.h[5]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v2.h[5]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[4], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v5.h[6]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v2.h[6]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[5], w9
-; CHECK-SD-NEXT: umov w9, v5.h[7]
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: cset w8, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v2.h[7]
-; CHECK-SD-NEXT: csinv w8, w8, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[6], w8
-; CHECK-SD-NEXT: tst w9, #0xffff
-; CHECK-SD-NEXT: umov w8, v4.h[0]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v1.h[0]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[7], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v4.h[1]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v1.h[1]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[8], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v4.h[2]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v1.h[2]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[9], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v4.h[3]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst ...
[truncated]
|
@llvm/pr-subscribers-llvm-analysis Author: David Green (davemgreen) Changes: Unlike scalar, where AArch64 prefers expanding scmp/ucmp with select, under Neon we can use the arithmetic expansion to generate fewer instructions. Notably it also prevents the scalarization of vselect during vector-legalization. Patch is 32.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/108830.diff 9 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 50dc7d5c54c54a..caa3a57ebabc2e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2451,7 +2451,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
CmpIntrinsic::getLTPredicate(IID),
CostKind);
- if (TLI->shouldExpandCmpUsingSelects()) {
+ EVT VT = TLI->getValueType(DL, CmpTy, true);
+ if (TLI->shouldExpandCmpUsingSelects(VT)) {
// x < y ? -1 : (x > y ? 1 : 0)
Cost += 2 * thisT()->getCmpSelInstrCost(
BinaryOperator::Select, RetTy, CondTy,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e17d68d2690c86..802510dd0e4fa0 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3409,7 +3409,7 @@ class TargetLoweringBase {
/// Should we expand [US]CMP nodes using two selects and two compares, or by
/// doing arithmetic on boolean types
- virtual bool shouldExpandCmpUsingSelects() const { return false; }
+ virtual bool shouldExpandCmpUsingSelects(EVT VT) const { return false; }
/// Does this target support complex deinterleaving
virtual bool isComplexDeinterleavingSupported() const { return false; }
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index ca379a691da918..95937886280685 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10681,7 +10681,7 @@ SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const {
// because one of the conditions can be merged with one of the selects.
// And finally, if we don't know the contents of high bits of a boolean value
// we can't perform any arithmetic either.
- if (shouldExpandCmpUsingSelects() || BoolVT.getScalarSizeInBits() == 1 ||
+ if (shouldExpandCmpUsingSelects(VT) || BoolVT.getScalarSizeInBits() == 1 ||
getBooleanContents(BoolVT) == UndefinedBooleanContent) {
SDValue SelectZeroOrOne =
DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT),
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 47da9d577cd827..d41f45ac0ce823 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27781,6 +27781,12 @@ bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
}
+bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
+ // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
+ // avoid vselect becoming bsl / unrolling.
+ return !VT.isFixedLengthVector();
+}
+
MachineInstr *
AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
MachineBasicBlock::instr_iterator &MBBI,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index f9d45b02d30e30..06b918f9ccaa28 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -914,7 +914,7 @@ class AArch64TargetLowering : public TargetLowering {
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
- bool shouldExpandCmpUsingSelects() const override { return true; }
+ bool shouldExpandCmpUsingSelects(EVT VT) const override;
bool isComplexDeinterleavingSupported() const override;
bool isComplexDeinterleavingOperationSupported(
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 1e7285e3e0fc53..4a18bde00a0b98 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -507,7 +507,7 @@ class SystemZTargetLowering : public TargetLowering {
bool shouldConsiderGEPOffsetSplit() const override { return true; }
- bool shouldExpandCmpUsingSelects() const override { return true; }
+ bool shouldExpandCmpUsingSelects(EVT VT) const override { return true; }
const char *getTargetNodeName(unsigned Opcode) const override;
std::pair<unsigned, const TargetRegisterClass *>
diff --git a/llvm/test/Analysis/CostModel/AArch64/cmp.ll b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
index 1b4b5eb616b5a9..a56ca8890e307b 100644
--- a/llvm/test/Analysis/CostModel/AArch64/cmp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/cmp.ll
@@ -128,16 +128,16 @@ define void @uscmp() {
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u16 = call i16 @llvm.ucmp.i16.i16(i16 undef, i16 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u32 = call i32 @llvm.ucmp.i32.i32(i32 undef, i32 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u64 = call i64 @llvm.ucmp.i64.i64(i64 undef, i64 undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8 = call i8 @llvm.scmp.i8.i8(i8 undef, i8 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16 = call i16 @llvm.scmp.i16.i16(i16 undef, i16 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s32 = call i32 @llvm.scmp.i32.i32(i32 undef, i32 undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s64 = call i64 @llvm.scmp.i64.i64(i64 undef, i64 undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
-; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; CHECK-SIZE-LABEL: 'uscmp'
@@ -145,16 +145,16 @@ define void @uscmp() {
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u16 = call i16 @llvm.ucmp.i16.i16(i16 undef, i16 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u32 = call i32 @llvm.ucmp.i32.i32(i32 undef, i32 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %u64 = call i64 @llvm.ucmp.i64.i64(i64 undef, i64 undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv16i8 = call <16 x i8> @llvm.ucmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv8i16 = call <8 x i16> @llvm.ucmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %uv4i32 = call <4 x i32> @llvm.ucmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s8 = call i8 @llvm.scmp.i8.i8(i8 undef, i8 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s16 = call i16 @llvm.scmp.i16.i16(i16 undef, i16 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s32 = call i32 @llvm.scmp.i32.i32(i32 undef, i32 undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %s64 = call i64 @llvm.scmp.i64.i64(i64 undef, i64 undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
-; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv16i8 = call <16 x i8> @llvm.scmp.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv8i16 = call <8 x i16> @llvm.scmp.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef)
+; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %sv4i32 = call <4 x i32> @llvm.scmp.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef)
; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%u8 = call i8 @llvm.ucmp(i8 undef, i8 undef)
diff --git a/llvm/test/CodeGen/AArch64/scmp.ll b/llvm/test/CodeGen/AArch64/scmp.ll
index 3d18a904ed2d3f..4aff5a836e1a18 100644
--- a/llvm/test/CodeGen/AArch64/scmp.ll
+++ b/llvm/test/CodeGen/AArch64/scmp.ll
@@ -136,11 +136,9 @@ define i64 @scmp.64.64(i64 %x, i64 %y) nounwind {
define <8 x i8> @s_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-SD-LABEL: s_v8i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.8b, #1
-; CHECK-SD-NEXT: cmgt v3.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT: cmgt v2.8b, v0.8b, v1.8b
; CHECK-SD-NEXT: cmgt v0.8b, v1.8b, v0.8b
-; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
-; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT: sub v0.8b, v0.8b, v2.8b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v8i8:
@@ -160,11 +158,9 @@ entry:
define <16 x i8> @s_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: s_v16i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.16b, #1
-; CHECK-SD-NEXT: cmgt v3.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT: cmgt v2.16b, v0.16b, v1.16b
; CHECK-SD-NEXT: cmgt v0.16b, v1.16b, v0.16b
-; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
-; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT: sub v0.16b, v0.16b, v2.16b
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v16i8:
@@ -184,11 +180,9 @@ entry:
define <4 x i16> @s_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-SD-LABEL: s_v4i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.4h, #1
-; CHECK-SD-NEXT: cmgt v3.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT: cmgt v2.4h, v0.4h, v1.4h
; CHECK-SD-NEXT: cmgt v0.4h, v1.4h, v0.4h
-; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
-; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT: sub v0.4h, v0.4h, v2.4h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v4i16:
@@ -208,11 +202,9 @@ entry:
define <8 x i16> @s_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-SD-LABEL: s_v8i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.8h, #1
-; CHECK-SD-NEXT: cmgt v3.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT: cmgt v2.8h, v0.8h, v1.8h
; CHECK-SD-NEXT: cmgt v0.8h, v1.8h, v0.8h
-; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
-; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT: sub v0.8h, v0.8h, v2.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v8i16:
@@ -232,15 +224,12 @@ entry:
define <16 x i16> @s_v16i16(<16 x i16> %a, <16 x i16> %b) {
; CHECK-SD-LABEL: s_v16i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v4.8h, #1
+; CHECK-SD-NEXT: cmgt v4.8h, v1.8h, v3.8h
; CHECK-SD-NEXT: cmgt v5.8h, v0.8h, v2.8h
-; CHECK-SD-NEXT: cmgt v6.8h, v1.8h, v3.8h
; CHECK-SD-NEXT: cmgt v0.8h, v2.8h, v0.8h
; CHECK-SD-NEXT: cmgt v1.8h, v3.8h, v1.8h
-; CHECK-SD-NEXT: and v2.16b, v5.16b, v4.16b
-; CHECK-SD-NEXT: and v3.16b, v6.16b, v4.16b
-; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
-; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT: sub v0.8h, v0.8h, v5.8h
+; CHECK-SD-NEXT: sub v1.8h, v1.8h, v4.8h
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v16i16:
@@ -264,11 +253,9 @@ entry:
define <2 x i32> @s_v2i32(<2 x i32> %a, <2 x i32> %b) {
; CHECK-SD-LABEL: s_v2i32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.2s, #1
-; CHECK-SD-NEXT: cmgt v3.2s, v0.2s, v1.2s
+; CHECK-SD-NEXT: cmgt v2.2s, v0.2s, v1.2s
; CHECK-SD-NEXT: cmgt v0.2s, v1.2s, v0.2s
-; CHECK-SD-NEXT: and v1.8b, v3.8b, v2.8b
-; CHECK-SD-NEXT: orr v0.8b, v1.8b, v0.8b
+; CHECK-SD-NEXT: sub v0.2s, v0.2s, v2.2s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v2i32:
@@ -288,11 +275,9 @@ entry:
define <4 x i32> @s_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: s_v4i32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v2.4s, #1
-; CHECK-SD-NEXT: cmgt v3.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: cmgt v2.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: cmgt v0.4s, v1.4s, v0.4s
-; CHECK-SD-NEXT: and v1.16b, v3.16b, v2.16b
-; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT: sub v0.4s, v0.4s, v2.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v4i32:
@@ -312,15 +297,12 @@ entry:
define <8 x i32> @s_v8i32(<8 x i32> %a, <8 x i32> %b) {
; CHECK-SD-LABEL: s_v8i32:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: movi v4.4s, #1
+; CHECK-SD-NEXT: cmgt v4.4s, v1.4s, v3.4s
; CHECK-SD-NEXT: cmgt v5.4s, v0.4s, v2.4s
-; CHECK-SD-NEXT: cmgt v6.4s, v1.4s, v3.4s
; CHECK-SD-NEXT: cmgt v0.4s, v2.4s, v0.4s
; CHECK-SD-NEXT: cmgt v1.4s, v3.4s, v1.4s
-; CHECK-SD-NEXT: and v2.16b, v5.16b, v4.16b
-; CHECK-SD-NEXT: and v3.16b, v6.16b, v4.16b
-; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
-; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT: sub v0.4s, v0.4s, v5.4s
+; CHECK-SD-NEXT: sub v1.4s, v1.4s, v4.4s
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v8i32:
@@ -344,12 +326,9 @@ entry:
define <2 x i64> @s_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-SD-LABEL: s_v2i64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #1 // =0x1
; CHECK-SD-NEXT: cmgt v2.2d, v0.2d, v1.2d
; CHECK-SD-NEXT: cmgt v0.2d, v1.2d, v0.2d
-; CHECK-SD-NEXT: dup v3.2d, x8
-; CHECK-SD-NEXT: and v1.16b, v2.16b, v3.16b
-; CHECK-SD-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT: sub v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v2i64:
@@ -370,16 +349,12 @@ entry:
define <4 x i64> @s_v4i64(<4 x i64> %a, <4 x i64> %b) {
; CHECK-SD-LABEL: s_v4i64:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mov w8, #1 // =0x1
-; CHECK-SD-NEXT: cmgt v4.2d, v0.2d, v2.2d
-; CHECK-SD-NEXT: cmgt v6.2d, v1.2d, v3.2d
-; CHECK-SD-NEXT: dup v5.2d, x8
+; CHECK-SD-NEXT: cmgt v4.2d, v1.2d, v3.2d
+; CHECK-SD-NEXT: cmgt v5.2d, v0.2d, v2.2d
; CHECK-SD-NEXT: cmgt v0.2d, v2.2d, v0.2d
; CHECK-SD-NEXT: cmgt v1.2d, v3.2d, v1.2d
-; CHECK-SD-NEXT: and v2.16b, v4.16b, v5.16b
-; CHECK-SD-NEXT: and v3.16b, v6.16b, v5.16b
-; CHECK-SD-NEXT: orr v0.16b, v2.16b, v0.16b
-; CHECK-SD-NEXT: orr v1.16b, v3.16b, v1.16b
+; CHECK-SD-NEXT: sub v0.2d, v0.2d, v5.2d
+; CHECK-SD-NEXT: sub v1.2d, v1.2d, v4.2d
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: s_v4i64:
@@ -404,122 +379,13 @@ entry:
define <16 x i8> @signOf_neon_scmp(<8 x i16> %s0_lo, <8 x i16> %s0_hi, <8 x i16> %s1_lo, <8 x i16> %s1_hi) {
; CHECK-SD-LABEL: signOf_neon_scmp:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: cmgt v5.8h, v0.8h, v2.8h
-; CHECK-SD-NEXT: cmgt v2.8h, v2.8h, v0.8h
; CHECK-SD-NEXT: cmgt v4.8h, v1.8h, v3.8h
; CHECK-SD-NEXT: cmgt v1.8h, v3.8h, v1.8h
-; CHECK-SD-NEXT: umov w8, v5.h[1]
-; CHECK-SD-NEXT: umov w9, v2.h[1]
-; CHECK-SD-NEXT: umov w10, v5.h[0]
-; CHECK-SD-NEXT: umov w11, v2.h[0]
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: cset w8, ne
-; CHECK-SD-NEXT: tst w9, #0xffff
-; CHECK-SD-NEXT: csinv w8, w8, wzr, eq
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v5.h[2]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w11, #0xffff
-; CHECK-SD-NEXT: umov w11, v2.h[2]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v2.h[3]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w11, #0xffff
-; CHECK-SD-NEXT: mov v0.b[1], w8
-; CHECK-SD-NEXT: umov w8, v5.h[3]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[2], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v5.h[4]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v2.h[4]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[3], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v5.h[5]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v2.h[5]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[4], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v5.h[6]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v2.h[6]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[5], w9
-; CHECK-SD-NEXT: umov w9, v5.h[7]
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: cset w8, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v2.h[7]
-; CHECK-SD-NEXT: csinv w8, w8, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[6], w8
-; CHECK-SD-NEXT: tst w9, #0xffff
-; CHECK-SD-NEXT: umov w8, v4.h[0]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v1.h[0]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[7], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v4.h[1]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v1.h[1]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[8], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v4.h[2]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst w10, #0xffff
-; CHECK-SD-NEXT: umov w10, v1.h[2]
-; CHECK-SD-NEXT: csinv w9, w9, wzr, eq
-; CHECK-SD-NEXT: mov v0.b[9], w9
-; CHECK-SD-NEXT: tst w8, #0xffff
-; CHECK-SD-NEXT: umov w8, v4.h[3]
-; CHECK-SD-NEXT: cset w9, ne
-; CHECK-SD-NEXT: tst ...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Thanks. It looks like this didn't affect clang-19 in the example I found it in, and the codegen is now better than it was before it started recognizing the ucmp/scmp. |
Unlike scalar, where AArch64 prefers expanding scmp/ucmp with select, under Neon we can use the arithmetic expansion to generate fewer instructions. Notably it also prevents the scalarization of vselect during vector-legalization.