return DAG.getBitcast(VT, Cvt);
}
+static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
+  SDValue Src = N->getOperand(0);
+
+  // Turn MOVDQ2Q+simple_load into an mmx load.
+  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+    LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
+
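+    // Only narrow simple (non-volatile, non-atomic) loads; the new mmx load
+    // reads just the low 64 bits of the original 128-bit access.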
+    if (LN->isSimple()) {
+      SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
+                                  LN->getBasePtr(),
+                                  LN->getPointerInfo(),
+                                  LN->getOriginalAlign(),
+                                  LN->getMemOperand()->getFlags());
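+      // Reroute users of the original load's chain output to the new load so
+      // the memory dependency is preserved.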
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
+      return NewLd;
+    }
+  }
+
+  return SDValue();
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
  case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
  case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
+  case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
  }
  return SDValue();
def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
          (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
-def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))),
-          (x86mmx (MMX_MOVQ64rm addr:$src))>;
-
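+// Loads feeding MMX_X86movdq2q are narrowed to x86mmx loads by combineMOVDQ2Q
+// in X86ISelLowering.cpp, so no load-folding pattern is needed here.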
def : Pat<(v2i64 (X86vzmovl (scalar_to_vector
                             (i64 (bitconvert (x86mmx VR64:$src)))))),
          (MMX_MOVQ2DQrr VR64:$src)>;
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -disable-peephole -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
define i64 @t0(<1 x i64>* %a, i32* %b) nounwind {
; X86-LABEL: t0:
declare void @llvm.lifetime.start(i64, i8* nocapture)
declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+; Make sure we shrink this vector load and fold it.
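+; The 128-bit <4 x float> load should be narrowed to a 64-bit mmx load and
+; folded into the pshufw memory operand.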
+define x86_mmx @vec_load(<4 x float>* %x) {
+; X86-LABEL: vec_load:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pshufw $68, (%eax), %mm0 # mm0 = mem[0,1,0,1]
+; X86-NEXT:    paddsb %mm0, %mm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: vec_load:
+; X64:       # %bb.0:
+; X64-NEXT:    pshufw $68, (%rdi), %mm0 # mm0 = mem[0,1,0,1]
+; X64-NEXT:    paddsb %mm0, %mm0
+; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    retq
+  %z = load <4 x float>, <4 x float>* %x
+  %y = extractelement <4 x float> %z, i32 0
+  %a = insertelement <2 x float> undef, float %y, i32 0
+  %b = insertelement <2 x float> %a, float %y, i32 1
+  %c = bitcast <2 x float> %b to x86_mmx
+  %d = tail call x86_mmx @llvm.x86.mmx.padds.b(x86_mmx %c, x86_mmx %c)
+  ret x86_mmx %d
+}
+
+declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
+