From 476769498e9d2f406a9f9f64218f513636734422 Mon Sep 17 00:00:00 2001
From: Chris Lattner <sabre@nondot.org>
Date: Sun, 8 Mar 2009 01:51:30 +0000
Subject: [PATCH] implement an optimization to codegen c ? 1.0 : 2.0 as load {
 2.0, 1.0 } + c*4. For 2009-03-07-FPConstSelect.ll we now produce:

_f:
	xorl	%eax, %eax
	testl	%edi, %edi
	movl	$4, %ecx
	cmovne	%rax, %rcx
	leaq	LCPI1_0(%rip), %rax
	movss	(%rcx,%rax), %xmm0
	ret

previously we produced:

_f:
	subl	$4, %esp
	cmpl	$0, 8(%esp)
	movss	LCPI1_0, %xmm0
	je	LBB1_2	## entry
LBB1_1:	## entry
	movss	LCPI1_1, %xmm0
LBB1_2:	## entry
	movss	%xmm0, (%esp)
	flds	(%esp)
	addl	$4, %esp
	ret

on PPC the code also improves to:

_f:
	cntlzw r2, r3
	srwi r2, r2, 5
	li r3, lo16(LCPI1_0)
	slwi r2, r2, 2
	addis r3, r3, ha16(LCPI1_0)
	lfsx f1, r3, r2
	blr

from:

_f:
	li r2, lo16(LCPI1_1)
	cmplwi cr0, r3, 0
	addis r2, r2, ha16(LCPI1_1)
	beq cr0, LBB1_2	; entry
LBB1_1:	; entry
	li r2, lo16(LCPI1_0)
	addis r2, r2, ha16(LCPI1_0)
LBB1_2:	; entry
	lfs f1, 0(r2)
	blr

This also improves the existing pic-cpool case from:

foo:
	subl	$12, %esp
	call	.Lllvm$1.$piclabel
.Lllvm$1.$piclabel:
	popl	%eax
	addl	$_GLOBAL_OFFSET_TABLE_ + [.-.Lllvm$1.$piclabel], %eax
	cmpl	$0, 16(%esp)
	movsd	.LCPI1_0@GOTOFF(%eax), %xmm0
	je	.LBB1_2	# entry
.LBB1_1:	# entry
	movsd	.LCPI1_1@GOTOFF(%eax), %xmm0
.LBB1_2:	# entry
	movsd	%xmm0, (%esp)
	fldl	(%esp)
	addl	$12, %esp
	ret

to:

foo:
	call	.Lllvm$1.$piclabel
.Lllvm$1.$piclabel:
	popl	%eax
	addl	$_GLOBAL_OFFSET_TABLE_ + [.-.Lllvm$1.$piclabel], %eax
	xorl	%ecx, %ecx
	cmpl	$0, 4(%esp)
	movl	$8, %edx
	cmovne	%ecx, %edx
	fldl	.LCPI1_0@GOTOFF(%eax,%edx)
	ret

This triggers a few dozen times in spec FP 2000.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@66358 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp     | 55 +++++++++++++++++++++++++++-
 test/CodeGen/X86/2009-03-07-FPConstSelect.ll | 12 ++++++
 test/CodeGen/X86/pic-cpool.ll                |  6 +--
 3 files changed, 68 insertions(+), 5 deletions(-)
 create mode 100644 test/CodeGen/X86/2009-03-07-FPConstSelect.ll
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 788ffb6ad01..407814ce133 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14,8 +14,10 @@
 
 #define DEBUG_TYPE "dagcombine"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/DerivedTypes.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Target/TargetData.h"
 #include "llvm/Target/TargetFrameInfo.h"
@@ -2890,8 +2892,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
       return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT,
                          N0.getOperand(0), N0.getOperand(1),
                          N1, N2, N0.getOperand(2));
-    else
-      return SimplifySelect(N->getDebugLoc(), N0, N1, N2);
+    return SimplifySelect(N->getDebugLoc(), N0, N1, N2);
   }
 
   return SDValue();
@@ -5674,9 +5675,14 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
   return false;
 }
 
+/// SimplifySelectCC - Simplify an expression of the form (N0 cond N1) ? N2 : N3
+/// where 'cond' is the comparison specified by CC.
 SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
                                       SDValue N2, SDValue N3,
                                       ISD::CondCode CC, bool NotExtCompare) {
+  // (x ? y : y) -> y.
+  if (N2 == N3) return N2;
+  
   MVT VT = N2.getValueType();
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
   ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
@@ -5712,6 +5718,51 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
         return DAG.getNode(ISD::FABS, DL, VT, N3);
     }
   }
+  
+  // Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
+  // where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
+  // in it.  This is a win when the constant is not otherwise available because
+  // it replaces two constant pool loads with one.  We only do this if the FP
+  // type is known to be legal, because if it isn't, then we are before legalize
+  // types an we want the other legalization to happen first (e.g. to avoid
+  // messing with soft float).
+  if (ConstantFPSDNode *TV = dyn_cast<ConstantFPSDNode>(N2))
+    if (ConstantFPSDNode *FV = dyn_cast<ConstantFPSDNode>(N3)) {
+      if (TLI.isTypeLegal(N2.getValueType()) &&
+          // If both constants have multiple uses, then we won't need to do an
+          // extra load, they are likely around in registers for other users.
+          (TV->hasOneUse() || FV->hasOneUse())) {
+        Constant *Elts[] = {
+          const_cast<ConstantFP*>(FV->getConstantFPValue()),
+          const_cast<ConstantFP*>(TV->getConstantFPValue())
+        };
+        // Create a ConstantArray of the two constants.
+        Constant *CA =
+          ConstantArray::get(ArrayType::get(Elts[0]->getType(), 2), Elts, 2);
+        SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy());
+        unsigned Alignment =
+          1 << cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+        
+        // Get the offsets to the 0 and 1 element of the array so that we can
+        // select between them.
+        SDValue Zero = DAG.getIntPtrConstant(0);
+        unsigned EltSize =
+          (unsigned)TLI.getTargetData()->getTypePaddedSize(Elts[0]->getType());
+        SDValue One = DAG.getIntPtrConstant(EltSize);
+        
+        SDValue Cond = DAG.getSetCC(DL,
+                                    TLI.getSetCCResultType(N0.getValueType()),
+                                    N0, N1, CC);
+        SDValue CstOffset = DAG.getNode(ISD::SELECT, DL, Zero.getValueType(),
+                                        Cond, One, Zero);
+        CPIdx = DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(), CPIdx,
+                            CstOffset);
+        return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
+                           PseudoSourceValue::getConstantPool(), 0, false,
+                           Alignment);
+
+      }
+    }  
 
   // Check to see if we can perform the "gzip trick", transforming
   // (select_cc setlt X, 0, A, 0) -> (and (sra X, (sub size(X), 1), A)
diff --git a/test/CodeGen/X86/2009-03-07-FPConstSelect.ll b/test/CodeGen/X86/2009-03-07-FPConstSelect.ll
new file mode 100644
index 00000000000..28302c0f7b0
--- /dev/null
+++ b/test/CodeGen/X86/2009-03-07-FPConstSelect.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah | not grep xmm
+; This should do a single load into the fp stack for the return, not diddle with xmm registers.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin7"
+
+define float @f(i32 %x) nounwind readnone {
+entry:
+	%0 = icmp eq i32 %x, 0		; <i1> [#uses=1]
+	%iftmp.0.0 = select i1 %0, float 4.200000e+01, float 2.300000e+01
+	ret float %iftmp.0.0
+}
diff --git a/test/CodeGen/X86/pic-cpool.ll b/test/CodeGen/X86/pic-cpool.ll
index cac31ed2ad7..a2b5d265879 100644
--- a/test/CodeGen/X86/pic-cpool.ll
+++ b/test/CodeGen/X86/pic-cpool.ll
@@ -2,10 +2,10 @@
 ; RUN:   -o %t -f
 ; RUN: grep _GLOBAL_OFFSET_TABLE_ %t
 ; RUN: grep piclabel %t | count 3
-; RUN: grep GOTOFF %t | count 2
-; RUN: grep CPI %t | count 4
+; RUN: grep GOTOFF %t | count 1
+; RUN: grep CPI %t | count 2
 
-define double @foo(i32 %a.u) {
+define double @foo(i32 %a.u) nounwind {
 entry:
     %tmp = icmp eq i32 %a.u,0
     %retval = select i1 %tmp, double 4.561230e+02, double 1.234560e+02
-- 
2.11.0