From 6902c687b0f6466ea65c9a1e22d7e064a34df1f5 Mon Sep 17 00:00:00 2001
From: Sanjay Patel <spatel@rotateright.com>
Date: Tue, 5 Aug 2014 17:35:22 +0000
Subject: [PATCH] Optimize vector fabs of bitcasted constant integer values.

Allow vector fabs operations on bitcasted constant integer values to be optimized
in the same way that we already optimize scalar fabs.

So for code like this:
%bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
%ret = bitcast <2 x float> %fabs to i64

Instead of generating something like this:

movabsq (put constant value in integer register)
vmovq   (move from integer register to vector/fp register)
vandps  (mask off sign bits with a mask loaded from the constant pool)
vmovq   (move vector/fp register back to integer return register)

We should generate:

mov     (put constant value in return register)

I have also removed a redundant clause in the first 'if' statement:
N0.getOperand(0).getValueType().isInteger()

is the same thing as:
IntVT.isInteger()

Testcases for x86 and ARM added to existing files that deal with vector fabs.
One existing testcase for x86 removed because it is no longer ideal.

For more background, please see:
http://reviews.llvm.org/D4770

And:
http://llvm.org/bugs/show_bug.cgi?id=20354

Differential Revision: http://reviews.llvm.org/D4785


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@214892 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 ++++++++++------
 test/CodeGen/ARM/fabs-neon.ll            | 37 +++++++++++++++++++++++++
 test/CodeGen/X86/vec_fabs.ll             | 47 ++++++++++++++++++++++----------
 3 files changed, 84 insertions(+), 24 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 035ce57bf0f..d76b1eb39eb 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7409,22 +7409,28 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
   if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
     return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));
 
-  // Transform fabs(bitconvert(x)) -> bitconvert(x&~sign) to avoid loading
+  // Transform fabs(bitconvert(x)) -> bitconvert(x & ~sign) to avoid loading
   // constant pool values.
-  // TODO: We can also optimize for vectors here, but we need to make sure
-  // that the sign mask is created properly for each vector element.
   if (!TLI.isFAbsFree(VT) &&
-      N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse() &&
-      N0.getOperand(0).getValueType().isInteger() &&
-      !VT.isVector()) {
+      N0.getOpcode() == ISD::BITCAST &&
+      N0.getNode()->hasOneUse()) {
     SDValue Int = N0.getOperand(0);
     EVT IntVT = Int.getValueType();
     if (IntVT.isInteger() && !IntVT.isVector()) {
+      APInt SignMask;
+      if (N0.getValueType().isVector()) {
+        // For a vector, get a mask such as 0x7f... per scalar element
+        // and splat it.
+        SignMask = ~APInt::getSignBit(N0.getValueType().getScalarSizeInBits());
+        SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
+      } else {
+        // For a scalar, just generate 0x7f...
+        SignMask = ~APInt::getSignBit(IntVT.getSizeInBits());
+      }
       Int = DAG.getNode(ISD::AND, SDLoc(N0), IntVT, Int,
-             DAG.getConstant(~APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
+                        DAG.getConstant(SignMask, IntVT));
       AddToWorklist(Int.getNode());
-      return DAG.getNode(ISD::BITCAST, SDLoc(N),
-                         N->getValueType(0), Int);
+      return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Int);
     }
   }
 
diff --git a/test/CodeGen/ARM/fabs-neon.ll b/test/CodeGen/ARM/fabs-neon.ll
index e3094aaf57d..5a176b2e82c 100644
--- a/test/CodeGen/ARM/fabs-neon.ll
+++ b/test/CodeGen/ARM/fabs-neon.ll
@@ -15,3 +15,40 @@ define <2 x float> @test2(<2 x float> %a) {
     ret <2 x float> %foo
 }
 declare <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+
+; No constant pool loads or vector ops are needed for the fabs of a
+; bitcasted integer constant; we should just return integer constants
+; that have the sign bits turned off.
+;
+; So instead of something like this:
+; 	mvn	r0, #0
+; 	mov	r1, #0
+; 	vmov	d16, r1, r0
+; 	vabs.f32	d16, d16
+; 	vmov	r0, r1, d16
+; 	bx	lr
+;
+; We should generate:
+;	mov	r0, #0
+;	mvn	r1, #-2147483648
+;	mov	pc, lr
+
+; CHECK-LABEL: fabs_v2f32_1
+define i64 @fabs_v2f32_1() {
+ %bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+; CHECK: mvn r1, #-2147483648
+; CHECK-NOT: vabs
+}
+
+; CHECK-LABEL: fabs_v2f32_2
+define i64 @fabs_v2f32_2() {
+ %bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+; CHECK: mvn r0, #-2147483648
+; CHECK-NOT: vabs
+}
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
index 4c14a9602d4..2271946abef 100644
--- a/test/CodeGen/X86/vec_fabs.ll
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -38,21 +38,38 @@ define <8 x float> @fabs_v8f32(<8 x float> %p)
 declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
 
 ; PR20354: when generating code for a vector fabs op,
-; make sure the correct mask is used for all vector elements.
-; CHECK-LABEL: .LCPI4_0:
-; CHECK-NEXT:    .long	2147483647
-; CHECK-NEXT:    .long	2147483647
-define i64 @fabs_v2f32(<2 x float> %v) {
-; CHECK-LABEL: fabs_v2f32:
-; CHECK:         movabsq $-9223372034707292160, %[[R:r[^ ]+]]
-; CHECK-NEXT:    vmovq %[[R]], %[[X:xmm[0-9]+]]
-; CHECK-NEXT:    vandps   {{.*}}.LCPI4_0{{.*}}, %[[X]], %[[X]]
-; CHECK-NEXT:    vmovq   %[[X]], %rax
-; CHECK-NEXT:    retq
-  %highbits = bitcast i64 9223372039002259456 to <2 x float> ; 0x8000_0000_8000_0000
-  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %highbits)
-  %ret = bitcast <2 x float> %fabs to i64
-  ret i64 %ret
+; make sure that we're only turning off the sign bit of each float value.
+; No constant pool loads or vector ops are needed for the fabs of a
+; bitcasted integer constant; we should just return an integer constant
+; that has the sign bits turned off.
+;
+; So instead of something like this:
+;    movabsq (put constant value in integer register)
+;    vmovq   (move from integer register to vector/fp register)
+;    vandps  (mask off sign bits with a mask loaded from the constant pool)
+;    vmovq   (move vector/fp register back to integer return register)
+;
+; We should generate:
+;    mov     (put constant value in return register)
+
+; CHECK-LABEL: fabs_v2f32_1
+define i64 @fabs_v2f32_1() {
+ %bitcast = bitcast i64 18446744069414584320 to <2 x float> ; 0xFFFF_FFFF_0000_0000
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+; CHECK: movabsq $9223372032559808512, %rax
+;  # imm = 0x7FFF_FFFF_0000_0000
+}
+
+; CHECK-LABEL: fabs_v2f32_2
+define i64 @fabs_v2f32_2() {
+ %bitcast = bitcast i64 4294967295 to <2 x float> ; 0x0000_0000_FFFF_FFFF
+ %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %bitcast)
+ %ret = bitcast <2 x float> %fabs to i64
+ ret i64 %ret
+; CHECK: movl $2147483647, %eax
+;  # imm = 0x0000_0000_7FFF_FFFF
 }
 
 declare <2 x float> @llvm.fabs.v2f32(<2 x float> %p)
-- 
2.11.0