{ X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
{ X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
- { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE }
+ { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },
+ // F16C foldable instructions
+ { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE },
+ { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
};
for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
{ X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
{ X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 },
{ X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 },
+ { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_ALIGN_16 },
{ X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
{ X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
{ X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
{ X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
+ { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_ALIGN_16 },
{ X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
{ X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
{ X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
{ X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
{ X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
{ X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 },
+ { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
+ { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
{ X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
{ X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 },
{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },
{ X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
{ X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
{ X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
+ { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, 0 },
{ X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
{ X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 },
{ X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 },
{ X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
+ { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, 0 },
{ X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 },
{ X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
{ X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
{ X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
{ X86::VRCPPSr, X86::VRCPPSm, 0 },
{ X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 },
+ { X86::VROUNDPDr, X86::VROUNDPDm, 0 },
+ { X86::VROUNDPSr, X86::VROUNDPSm, 0 },
{ X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
{ X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 },
{ X86::VSQRTPDr, X86::VSQRTPDm, 0 },
{ X86::VSQRTPSr, X86::VSQRTPSm, 0 },
+ { X86::VTESTPDrr, X86::VTESTPDrm, 0 },
+ { X86::VTESTPSrr, X86::VTESTPSrm, 0 },
{ X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
{ X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
{ X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
// AVX 256-bit foldable instructions
+ { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
{ X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
{ X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
{ X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
{ X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
+ { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 },
{ X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
{ X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
{ X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
{ X86::VRCPPSYr, X86::VRCPPSYm, 0 },
{ X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 },
+ { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 },
+ { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 },
{ X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
+ { X86::VRSQRTPSYr_Int, X86::VRSQRTPSYm_Int, 0 },
{ X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
{ X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
+ { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
+ { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
{ X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
{ X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
{ X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
-
+ // F16C foldable instructions
+ { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
+ { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
// AES foldable instructions
{ X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
{ X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
{ X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
{ X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
{ X86::ADDSDrr, X86::ADDSDrm, 0 },
+ { X86::ADDSDrr_Int, X86::ADDSDrm_Int, 0 },
{ X86::ADDSSrr, X86::ADDSSrm, 0 },
+ { X86::ADDSSrr_Int, X86::ADDSSrm_Int, 0 },
{ X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
{ X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
{ X86::AND16rr, X86::AND16rm, 0 },
{ X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
{ X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
{ X86::DIVSDrr, X86::DIVSDrm, 0 },
+ { X86::DIVSDrr_Int, X86::DIVSDrm_Int, 0 },
{ X86::DIVSSrr, X86::DIVSSrm, 0 },
+ { X86::DIVSSrr_Int, X86::DIVSSrm_Int, 0 },
+ { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
+ { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
{ X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 },
{ X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 },
{ X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 },
{ X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
{ X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
{ X86::MAXSDrr, X86::MAXSDrm, 0 },
+ { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 },
{ X86::MAXSSrr, X86::MAXSSrm, 0 },
+ { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },
{ X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
{ X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
{ X86::MINSDrr, X86::MINSDrm, 0 },
+ { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
{ X86::MINSSrr, X86::MINSSrm, 0 },
+ { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
{ X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
{ X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
{ X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
{ X86::MULSDrr, X86::MULSDrm, 0 },
+ { X86::MULSDrr_Int, X86::MULSDrm_Int, 0 },
{ X86::MULSSrr, X86::MULSSrm, 0 },
+ { X86::MULSSrr_Int, X86::MULSSrm_Int, 0 },
{ X86::OR16rr, X86::OR16rm, 0 },
{ X86::OR32rr, X86::OR32rm, 0 },
{ X86::OR64rr, X86::OR64rm, 0 },
{ X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
{ X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
{ X86::SUBSDrr, X86::SUBSDrm, 0 },
+ { X86::SUBSDrr_Int, X86::SUBSDrm_Int, 0 },
{ X86::SUBSSrr, X86::SUBSSrm, 0 },
+ { X86::SUBSSrr_Int, X86::SUBSSrm_Int, 0 },
// FIXME: TEST*rr -> swapped operand of TEST*mr.
{ X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
{ X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
{ X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
{ X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
{ X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 },
+ { X86::VRCPSSr, X86::VRCPSSm, 0 },
{ X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
{ X86::VSQRTSDr, X86::VSQRTSDm, 0 },
{ X86::VSQRTSSr, X86::VSQRTSSm, 0 },
{ X86::VADDPDrr, X86::VADDPDrm, 0 },
{ X86::VADDPSrr, X86::VADDPSrm, 0 },
{ X86::VADDSDrr, X86::VADDSDrm, 0 },
+ { X86::VADDSDrr_Int, X86::VADDSDrm_Int, 0 },
{ X86::VADDSSrr, X86::VADDSSrm, 0 },
+ { X86::VADDSSrr_Int, X86::VADDSSrm_Int, 0 },
{ X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
{ X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
{ X86::VANDNPDrr, X86::VANDNPDrm, 0 },
{ X86::VDIVPDrr, X86::VDIVPDrm, 0 },
{ X86::VDIVPSrr, X86::VDIVPSrm, 0 },
{ X86::VDIVSDrr, X86::VDIVSDrm, 0 },
+ { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, 0 },
{ X86::VDIVSSrr, X86::VDIVSSrm, 0 },
+ { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, 0 },
+ { X86::VDPPDrri, X86::VDPPDrmi, 0 },
+ { X86::VDPPSrri, X86::VDPPSrmi, 0 },
{ X86::VFsANDNPDrr, X86::VFsANDNPDrm, TB_ALIGN_16 },
{ X86::VFsANDNPSrr, X86::VFsANDNPSrm, TB_ALIGN_16 },
{ X86::VFsANDPDrr, X86::VFsANDPDrm, TB_ALIGN_16 },
{ X86::VMAXPDrr, X86::VMAXPDrm, 0 },
{ X86::VMAXPSrr, X86::VMAXPSrm, 0 },
{ X86::VMAXSDrr, X86::VMAXSDrm, 0 },
+ { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 },
{ X86::VMAXSSrr, X86::VMAXSSrm, 0 },
+ { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 },
{ X86::VMINPDrr, X86::VMINPDrm, 0 },
{ X86::VMINPSrr, X86::VMINPSrm, 0 },
{ X86::VMINSDrr, X86::VMINSDrm, 0 },
+ { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 },
{ X86::VMINSSrr, X86::VMINSSrm, 0 },
+ { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 },
{ X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
{ X86::VMULPDrr, X86::VMULPDrm, 0 },
{ X86::VMULPSrr, X86::VMULPSrm, 0 },
{ X86::VMULSDrr, X86::VMULSDrm, 0 },
+ { X86::VMULSDrr_Int, X86::VMULSDrm_Int, 0 },
{ X86::VMULSSrr, X86::VMULSSrm, 0 },
+ { X86::VMULSSrr_Int, X86::VMULSSrm_Int, 0 },
{ X86::VORPDrr, X86::VORPDrm, 0 },
{ X86::VORPSrr, X86::VORPSrm, 0 },
{ X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
{ X86::VSUBPDrr, X86::VSUBPDrm, 0 },
{ X86::VSUBPSrr, X86::VSUBPSrm, 0 },
{ X86::VSUBSDrr, X86::VSUBSDrm, 0 },
+ { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, 0 },
{ X86::VSUBSSrr, X86::VSUBSSrm, 0 },
+ { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, 0 },
{ X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
{ X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
{ X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
{ X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
{ X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
{ X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
+ { X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
{ X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
{ X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
{ X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
-; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-unknown"
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+f16c < %s | FileCheck %s\r
+\r
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"\r
+target triple = "x86_64-unknown-unknown"\r
; Stack reload folding tests.
;
;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fadd double %a0, %a1
- ret double %2
-}
-
-; TODO stack_fold_addsd_int
-declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_addss(float %a0, float %a1) {
+ ret double %2\r
+}\r
+\r
+; Stack-fold test for the llvm.x86.sse2.add.sd intrinsic: the CHECK line expects\r
+; vaddsd to take its memory operand from an %rsp slot as a 16-byte Folded Reload.\r
+; The "nop" asm clobbers xmm2-xmm15 (presumably to force an argument to spill).\r
+define <2 x double> @stack_fold_addsd_int(<2 x double> %a0, <2 x double> %a1) {\r
+ ;CHECK-LABEL: stack_fold_addsd_int\r
+ ;CHECK: vaddsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <2 x double> @llvm.x86.sse2.add.sd(<2 x double> %a0, <2 x double> %a1)\r
+ ret <2 x double> %2\r
+}\r
+declare <2 x double> @llvm.x86.sse2.add.sd(<2 x double>, <2 x double>) nounwind readnone\r
+\r
+define float @stack_fold_addss(float %a0, float %a1) {\r
;CHECK-LABEL: stack_fold_addss
;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fadd float %a0, %a1
- ret float %2
-}
-
-; TODO stack_fold_addss_int
-declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone
-
-define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {
+ ret float %2\r
+}\r
+\r
+; Stack-fold test for the llvm.x86.sse.add.ss intrinsic: the CHECK line expects\r
+; vaddss to take its memory operand from an %rsp slot as a 16-byte Folded Reload.\r
+; The "nop" asm clobbers xmm2-xmm15 (presumably to force an argument to spill).\r
+define <4 x float> @stack_fold_addss_int(<4 x float> %a0, <4 x float> %a1) {\r
+ ;CHECK-LABEL: stack_fold_addss_int\r
+ ;CHECK: vaddss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind readnone\r
+\r
+define <2 x double> @stack_fold_addsubpd(<2 x double> %a0, <2 x double> %a1) {\r
;CHECK-LABEL: stack_fold_addsubpd
;CHECK: vaddsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
ret i32 %2
-}
-declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
-
-; TODO stack_fold_cvtdq2pd
-declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone
-
-; TODO stack_fold_cvtdq2pd_ymm
-declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone
-
-define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {
+}\r
+declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone\r
+\r
+; Stack-fold test for the llvm.x86.sse2.cvtdq2pd intrinsic: the CHECK line expects\r
+; vcvtdq2pd to read its source from an %rsp slot (16-byte Folded Reload) into an xmm.\r
+; The "nop" asm clobbers xmm1-xmm15 (presumably to force the argument to spill).\r
+define <2 x double> @stack_fold_cvtdq2pd(<4 x i32> %a0) {\r
+ ;CHECK-LABEL: stack_fold_cvtdq2pd\r
+ ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0)\r
+ ret <2 x double> %2\r
+}\r
+declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone\r
+\r
+; Stack-fold test for the llvm.x86.avx.cvtdq2.pd.256 intrinsic: the CHECK line\r
+; expects vcvtdq2pd to read a 16-byte Folded Reload from %rsp into a ymm register\r
+; (the <4 x i32> source is 16 bytes even though the result is <4 x double>).\r
+; The "nop" asm clobbers xmm1-xmm15 (presumably to force the argument to spill).\r
+define <4 x double> @stack_fold_cvtdq2pd_ymm(<4 x i32> %a0) {\r
+ ;CHECK-LABEL: stack_fold_cvtdq2pd_ymm\r
+ ;CHECK: vcvtdq2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0)\r
+ ret <4 x double> %2\r
+}\r
+declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone\r
+\r
+define <4 x float> @stack_fold_cvtdq2ps(<4 x i32> %a0) {\r
;CHECK-LABEL: stack_fold_cvtdq2ps
;CHECK: vcvtdq2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
;CHECK: vcvtpd2psy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fptrunc <4 x double> %a0 to <4 x float>
- ret <4 x float> %2
-}
-
-; TODO stack_fold_cvtph2ps
-declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly
-
-; TODO stack_fold_cvtph2ps_ymm
-declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly
-
-define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {
+ ret <4 x float> %2\r
+}\r
+\r
+; Stack-fold test for the llvm.x86.vcvtph2ps.128 intrinsic: the CHECK line expects\r
+; vcvtph2ps to read its source from an %rsp slot (16-byte Folded Reload) into an xmm.\r
+; The "nop" asm clobbers xmm1-xmm15 (presumably to force the argument to spill).\r
+define <4 x float> @stack_fold_cvtph2ps(<8 x i16> %a0) {\r
+ ;CHECK-LABEL: stack_fold_cvtph2ps\r
+ ;CHECK: vcvtph2ps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16>) nounwind readonly\r
+\r
+; Stack-fold test for the llvm.x86.vcvtph2ps.256 intrinsic: the CHECK line expects\r
+; vcvtph2ps to read a 16-byte Folded Reload from %rsp into a ymm register\r
+; (the <8 x i16> source is 16 bytes; the result is <8 x float>).\r
+; The "nop" asm clobbers xmm1-xmm15 (presumably to force the argument to spill).\r
+define <8 x float> @stack_fold_cvtph2ps_ymm(<8 x i16> %a0) {\r
+ ;CHECK-LABEL: stack_fold_cvtph2ps_ymm\r
+ ;CHECK: vcvtph2ps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)\r
+ ret <8 x float> %2\r
+}\r
+declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readonly\r
+\r
+define <4 x i32> @stack_fold_cvtps2dq(<4 x float> %a0) {\r
;CHECK-LABEL: stack_fold_cvtps2dq
;CHECK: vcvtps2dq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
ret <8 x i32> %2
-}
-declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
-
-; TODO stack_fold_cvtps2pd
-; TODO stack_fold_cvtps2pd_ymm
-
-; TODO stack_fold_cvtps2ph
-declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly
-
-; TODO stack_fold_cvtps2ph_ymm
-declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
-
-; TODO stack_fold_cvtsd2si
+}\r
+declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone\r
+\r
+; Stack-fold test for the llvm.x86.sse2.cvtps2pd intrinsic: the CHECK line expects\r
+; vcvtps2pd to read its source from an %rsp slot (16-byte Folded Reload) into an xmm.\r
+; The "nop" asm clobbers xmm1-xmm15 (presumably to force the argument to spill).\r
+define <2 x double> @stack_fold_cvtps2pd(<4 x float> %a0) {\r
+ ;CHECK-LABEL: stack_fold_cvtps2pd\r
+ ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0)\r
+ ret <2 x double> %2\r
+}\r
+declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone\r
+\r
+; Stack-fold test for the llvm.x86.avx.cvt.ps2.pd.256 intrinsic: the CHECK line\r
+; expects vcvtps2pd to read a 16-byte Folded Reload from %rsp into a ymm register\r
+; (the <4 x float> source is 16 bytes; the result is <4 x double>).\r
+; The "nop" asm clobbers xmm1-xmm15 (presumably to force the argument to spill).\r
+define <4 x double> @stack_fold_cvtps2pd_ymm(<4 x float> %a0) {\r
+ ;CHECK-LABEL: stack_fold_cvtps2pd_ymm\r
+ ;CHECK: vcvtps2pd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0)\r
+ ret <4 x double> %2\r
+}\r
+declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone\r
+\r
+; Store-fold test for the llvm.x86.vcvtps2ph.128 intrinsic (immediate 0): the CHECK\r
+; line expects vcvtps2ph to write its result straight to an %rsp slot as a 16-byte\r
+; Folded Spill. Unlike the reload tests, the intrinsic call comes before the "nop"\r
+; asm, so the result %1 is live across the asm that clobbers xmm1-xmm15.\r
+define <8 x i16> @stack_fold_cvtps2ph(<4 x float> %a0) {\r
+ ;CHECK-LABEL: stack_fold_cvtps2ph\r
+ ;CHECK: vcvtps2ph $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill\r
+ %1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)\r
+ %2 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ ret <8 x i16> %1\r
+}\r
+declare <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float>, i32) nounwind readonly\r
+\r
+; Store-fold test for the llvm.x86.vcvtps2ph.256 intrinsic (immediate 0): the CHECK\r
+; line expects vcvtps2ph with a ymm source to write its <8 x i16> result straight to\r
+; an %rsp slot as a 16-byte Folded Spill. The intrinsic call comes before the "nop"\r
+; asm, so the result %1 is live across the asm that clobbers xmm1-xmm15.\r
+define <8 x i16> @stack_fold_cvtps2ph_ymm(<8 x float> %a0) {\r
+ ;CHECK-LABEL: stack_fold_cvtps2ph_ymm\r
+ ;CHECK: vcvtps2ph $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill\r
+ %1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)\r
+ %2 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ ret <8 x i16> %1\r
+}\r
+declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly\r
+\r
+; TODO stack_fold_cvtsd2si\r
define i32 @stack_fold_cvtsd2si_int(<2 x double> %a0) {
;CHECK-LABEL: stack_fold_cvtsd2si_int
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0)
ret i64 %2
-}
-declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
-
-; TODO stack_fold_cvttpd2dq
-
-define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) {
- ;CHECK-LABEL: stack_fold_cvttpd2dq_ymm
+}\r
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone\r
+\r
+; Stack-fold test for the llvm.x86.sse2.cvttpd2dq intrinsic: the CHECK line expects\r
+; the memory form to print as vcvttpd2dqx and to read a 16-byte Folded Reload from\r
+; an %rsp slot. The "nop" asm clobbers xmm1-xmm15 (presumably to force the spill).\r
+define <4 x i32> @stack_fold_cvttpd2dq(<2 x double> %a0) {\r
+ ;CHECK-LABEL: stack_fold_cvttpd2dq\r
+ ;CHECK: vcvttpd2dqx {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0)\r
+ ret <4 x i32> %2\r
+}\r
+declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone\r
+\r
+define <4 x i32> @stack_fold_cvttpd2dq_ymm(<4 x double> %a0) {\r
+ ;CHECK-LABEL: stack_fold_cvttpd2dq_ymm\r
;CHECK: vcvttpd2dqy {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fptosi <4 x double> %a0 to <4 x i32>
;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fdiv double %a0, %a1
- ret double %2
-}
-
-; TODO stack_fold_divsd_int
-declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_divss(float %a0, float %a1) {
+ ret double %2\r
+}\r
+\r
+; Stack-fold test for the llvm.x86.sse2.div.sd intrinsic: the CHECK line expects\r
+; vdivsd to take its memory operand from an %rsp slot as a 16-byte Folded Reload.\r
+; The "nop" asm clobbers xmm2-xmm15 (presumably to force an argument to spill).\r
+define <2 x double> @stack_fold_divsd_int(<2 x double> %a0, <2 x double> %a1) {\r
+ ;CHECK-LABEL: stack_fold_divsd_int\r
+ ;CHECK: vdivsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <2 x double> @llvm.x86.sse2.div.sd(<2 x double> %a0, <2 x double> %a1)\r
+ ret <2 x double> %2\r
+}\r
+declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone\r
+\r
+define float @stack_fold_divss(float %a0, float %a1) {\r
;CHECK-LABEL: stack_fold_divss
;CHECK: vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fdiv float %a0, %a1
- ret float %2
-}
-
-; TODO stack_fold_divss_int
-declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone
-
-; TODO stack_fold_dppd
-; TODO stack_fold_dppd_ymm
-; TODO stack_fold_dpps
-; TODO stack_fold_dpps_ymm
-
-define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {
- ;CHECK-LABEL: stack_fold_extractf128
+ ret float %2\r
+}\r
+\r
+; Stack-fold test for the llvm.x86.sse.div.ss intrinsic: the CHECK line expects\r
+; vdivss to take its memory operand from an %rsp slot as a 16-byte Folded Reload.\r
+; The "nop" asm clobbers xmm2-xmm15 (presumably to force an argument to spill).\r
+define <4 x float> @stack_fold_divss_int(<4 x float> %a0, <4 x float> %a1) {\r
+ ;CHECK-LABEL: stack_fold_divss_int\r
+ ;CHECK: vdivss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.sse.div.ss(<4 x float> %a0, <4 x float> %a1)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.sse.div.ss(<4 x float>, <4 x float>) nounwind readnone\r
+\r
+; Stack-fold test for the llvm.x86.sse41.dppd intrinsic with immediate 7: the CHECK\r
+; line expects "vdppd $7" to take its memory operand from an %rsp slot as a 16-byte\r
+; Folded Reload. The "nop" asm clobbers xmm2-xmm15 (presumably to force the spill).\r
+define <2 x double> @stack_fold_dppd(<2 x double> %a0, <2 x double> %a1) {\r
+ ;CHECK-LABEL: stack_fold_dppd\r
+ ;CHECK: vdppd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)\r
+ ret <2 x double> %2\r
+}\r
+declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone\r
+\r
+; Stack-fold test for the llvm.x86.sse41.dpps intrinsic with immediate 7: the CHECK\r
+; line expects "vdpps $7" to take its memory operand from an %rsp slot as a 16-byte\r
+; Folded Reload. The "nop" asm clobbers xmm2-xmm15 (presumably to force the spill).\r
+define <4 x float> @stack_fold_dpps(<4 x float> %a0, <4 x float> %a1) {\r
+ ;CHECK-LABEL: stack_fold_dpps\r
+ ;CHECK: vdpps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone\r
+\r
+; Stack-fold test for the llvm.x86.avx.dp.ps.256 intrinsic with immediate 7: the\r
+; CHECK line expects "vdpps $7" on ymm registers to take its memory operand from an\r
+; %rsp slot as a 32-byte Folded Reload. The "nop" asm clobbers xmm2-xmm15\r
+; (presumably to force an argument to spill).\r
+define <8 x float> @stack_fold_dpps_ymm(<8 x float> %a0, <8 x float> %a1) {\r
+ ;CHECK-LABEL: stack_fold_dpps_ymm\r
+ ;CHECK: vdpps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)\r
+ ret <8 x float> %2\r
+}\r
+declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone\r
+\r
+define <4 x float> @stack_fold_extractf128(<8 x float> %a0, <8 x float> %a1) {\r
+ ;CHECK-LABEL: stack_fold_extractf128\r
;CHECK: vextractf128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
;CHECK: vmovaps {{-?[0-9]*}}(%rsp), %xmm0 {{.*#+}} 16-byte Reload
%1 = shufflevector <8 x float> %a0, <8 x float> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fcmp ogt double %a0, %a1
%3 = select i1 %2, double %a0, double %a1
- ret double %3
-}
-
-; TODO stack_fold_maxsd_int
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_maxss(float %a0, float %a1) {
+ ret double %3\r
+}\r
+\r
+; Stack-fold test for the llvm.x86.sse2.max.sd intrinsic: the CHECK line expects\r
+; vmaxsd to take its memory operand from an %rsp slot as a 16-byte Folded Reload.\r
+; The "nop" asm clobbers xmm2-xmm15 (presumably to force an argument to spill).\r
+define <2 x double> @stack_fold_maxsd_int(<2 x double> %a0, <2 x double> %a1) {\r
+ ;CHECK-LABEL: stack_fold_maxsd_int\r
+ ;CHECK: vmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1)\r
+ ret <2 x double> %2\r
+}\r
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone\r
+\r
+define float @stack_fold_maxss(float %a0, float %a1) {\r
;CHECK-LABEL: stack_fold_maxss
;CHECK: vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fcmp ogt float %a0, %a1
%3 = select i1 %2, float %a0, float %a1
- ret float %3
-}
-
-; TODO stack_fold_maxss_int
-declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
-
-define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {
+ ret float %3\r
+}\r
+\r
+; Stack-fold test for the llvm.x86.sse.max.ss intrinsic: the CHECK line expects\r
+; vmaxss to take its memory operand from an %rsp slot as a 16-byte Folded Reload.\r
+; The "nop" asm clobbers xmm2-xmm15 (presumably to force an argument to spill).\r
+define <4 x float> @stack_fold_maxss_int(<4 x float> %a0, <4 x float> %a1) {\r
+ ;CHECK-LABEL: stack_fold_maxss_int\r
+ ;CHECK: vmaxss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone\r
+\r
+define <2 x double> @stack_fold_minpd(<2 x double> %a0, <2 x double> %a1) {\r
;CHECK-LABEL: stack_fold_minpd
;CHECK: vminpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fcmp olt double %a0, %a1
%3 = select i1 %2, double %a0, double %a1
- ret double %3
-}
-
-; TODO stack_fold_minsd_int
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_minss(float %a0, float %a1) {
+ ret double %3\r
+}\r
+\r
+; Stack-fold test for the llvm.x86.sse2.min.sd intrinsic: the CHECK line expects\r
+; vminsd to take its memory operand from an %rsp slot as a 16-byte Folded Reload.\r
+; The "nop" asm clobbers xmm2-xmm15 (presumably to force an argument to spill).\r
+define <2 x double> @stack_fold_minsd_int(<2 x double> %a0, <2 x double> %a1) {\r
+ ;CHECK-LABEL: stack_fold_minsd_int\r
+ ;CHECK: vminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1)\r
+ ret <2 x double> %2\r
+}\r
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone\r
+\r
+define float @stack_fold_minss(float %a0, float %a1) {\r
;CHECK-LABEL: stack_fold_minss
;CHECK: vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fcmp olt float %a0, %a1
%3 = select i1 %2, float %a0, float %a1
- ret float %3
-}
-
-; TODO stack_fold_minss_int
-declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
-
-; TODO stack_fold_movd (load / store)
+ ret float %3\r
+}\r
+\r
+; Stack-fold test for the llvm.x86.sse.min.ss intrinsic: the CHECK line expects\r
+; vminss to take its memory operand from an %rsp slot as a 16-byte Folded Reload.\r
+; The "nop" asm clobbers xmm2-xmm15 (presumably to force an argument to spill).\r
+define <4 x float> @stack_fold_minss_int(<4 x float> %a0, <4 x float> %a1) {\r
+ ;CHECK-LABEL: stack_fold_minss_int\r
+ ;CHECK: vminss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone\r
+\r
+; TODO stack_fold_movd (load / store)\r
; TODO stack_fold_movq (load / store)
; TODO stack_fold_movddup
;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fmul double %a0, %a1
- ret double %2
-}
-
-; TODO stack_fold_mulsd_int
-declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_mulss(float %a0, float %a1) {
+ ret double %2\r
+}\r
+\r
+; Stack-folding check for the llvm.x86.sse2.mul.sd intrinsic: FileCheck expects a vmulsd whose\r
+; source is an %rsp-relative slot annotated "16-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm2-xmm15 and flags (presumably to force a spill; confirm).\r
+define <2 x double> @stack_fold_mulsd_int(<2 x double> %a0, <2 x double> %a1) {\r
+ ;CHECK-LABEL: stack_fold_mulsd_int\r
+ ;CHECK: vmulsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <2 x double> @llvm.x86.sse2.mul.sd(<2 x double> %a0, <2 x double> %a1)\r
+ ret <2 x double> %2\r
+}\r
+declare <2 x double> @llvm.x86.sse2.mul.sd(<2 x double>, <2 x double>) nounwind readnone\r
+\r
+define float @stack_fold_mulss(float %a0, float %a1) {\r
;CHECK-LABEL: stack_fold_mulss
;CHECK: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fmul float %a0, %a1
- ret float %2
-}
-
-; TODO stack_fold_mulss_int
-declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone
-
-define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
+ ret float %2\r
+}\r
+\r
+; Stack-folding check for the llvm.x86.sse.mul.ss intrinsic: FileCheck expects a vmulss whose\r
+; source is an %rsp-relative slot annotated "16-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm2-xmm15 and flags (presumably to force a spill; confirm).\r
+define <4 x float> @stack_fold_mulss_int(<4 x float> %a0, <4 x float> %a1) {\r
+ ;CHECK-LABEL: stack_fold_mulss_int\r
+ ;CHECK: vmulss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.sse.mul.ss(<4 x float> %a0, <4 x float> %a1)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind readnone\r
+\r
+define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {\r
;CHECK-LABEL: stack_fold_orpd
;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
;CHECK: vpermilpd $5, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
- ret <4 x double> %2
-}
-
-define <4 x float> @stack_fold_permilps(<4 x float> %a0) {
- ;CHECK-LABEL: stack_fold_permilps
- ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ret <4 x double> %2\r
+}\r
+\r
+; Stack-folding check for the variable-control llvm.x86.avx.vpermilvar.pd intrinsic (xmm form):\r
+; FileCheck expects a register-register-memory vpermilpd (no immediate) reading an %rsp-relative\r
+; slot annotated "16-byte Folded Reload". The "nop" inline asm clobbers xmm2-xmm15 and flags.\r
+define <2 x double> @stack_fold_permilpdvar(<2 x double> %a0, <2 x i64> %a1) {\r
+ ;CHECK-LABEL: stack_fold_permilpdvar\r
+ ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)\r
+ ret <2 x double> %2\r
+}\r
+declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone\r
+\r
+; Stack-folding check for the 256-bit llvm.x86.avx.vpermilvar.pd.256 intrinsic: FileCheck expects\r
+; a ymm vpermilpd reading an %rsp-relative slot annotated "32-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm2-xmm15 and flags (presumably to force a spill; confirm).\r
+define <4 x double> @stack_fold_permilpdvar_ymm(<4 x double> %a0, <4 x i64> %a1) {\r
+ ;CHECK-LABEL: stack_fold_permilpdvar_ymm\r
+ ;CHECK: vpermilpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)\r
+ ret <4 x double> %2\r
+}\r
+declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone\r
+\r
+define <4 x float> @stack_fold_permilps(<4 x float> %a0) {\r
+ ;CHECK-LABEL: stack_fold_permilps\r
+ ;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x float> %2
;CHECK: vpermilps $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
- ret <8 x float> %2
-}
-
-; TODO stack_fold_rcpps
-
-define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {
+ ret <8 x float> %2\r
+}\r
+\r
+; Stack-folding check for the variable-control llvm.x86.avx.vpermilvar.ps intrinsic (xmm form):\r
+; FileCheck expects a register-register-memory vpermilps (no immediate) reading an %rsp-relative\r
+; slot annotated "16-byte Folded Reload". The "nop" inline asm clobbers xmm2-xmm15 and flags.\r
+define <4 x float> @stack_fold_permilpsvar(<4 x float> %a0, <4 x i32> %a1) {\r
+ ;CHECK-LABEL: stack_fold_permilpsvar\r
+ ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %a1)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone\r
+\r
+; Stack-folding check for the 256-bit llvm.x86.avx.vpermilvar.ps.256 intrinsic: FileCheck expects\r
+; a ymm vpermilps reading an %rsp-relative slot annotated "32-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm2-xmm15 and flags (presumably to force a spill; confirm).\r
+define <8 x float> @stack_fold_permilpsvar_ymm(<8 x float> %a0, <8 x i32> %a1) {\r
+ ;CHECK-LABEL: stack_fold_permilpsvar_ymm\r
+ ;CHECK: vpermilps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %a1)\r
+ ret <8 x float> %2\r
+}\r
+declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone\r
+\r
+; TODO stack_fold_rcpps\r
+\r
+define <4 x float> @stack_fold_rcpps_int(<4 x float> %a0) {\r
;CHECK-LABEL: stack_fold_rcpps_int
;CHECK: vrcpps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
ret <8 x float> %2
}
-declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
-
-; TODO stack_fold_rcpss
-; TODO stack_fold_rcpss_int
-
-; TODO stack_fold_roundpd
-; TODO stack_fold_roundps
-; TODO stack_fold_roundsd (+ int)
-; TODO stack_fold_roundss (+ int)
-
-; TODO stack_fold_rsqrtps
-
+declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone\r
+\r
+; TODO stack_fold_rcpss\r
+\r
+; Stack-folding check for the single-operand llvm.x86.sse.rcp.ss intrinsic: FileCheck expects the\r
+; three-operand vrcpss form with an %rsp-relative source annotated "16-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm1-xmm15 and flags (presumably to force a spill; confirm).\r
+define <4 x float> @stack_fold_rcpss_int(<4 x float> %a0) {\r
+ ;CHECK-LABEL: stack_fold_rcpss_int\r
+ ;CHECK: vrcpss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone\r
+\r
+; Stack-folding check for llvm.x86.sse41.round.pd called with immediate 7: FileCheck expects\r
+; "vroundpd $7" reading an %rsp-relative slot annotated "16-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm1-xmm15 and flags (presumably to force a spill; confirm).\r
+define <2 x double> @stack_fold_roundpd(<2 x double> %a0) {\r
+ ;CHECK-LABEL: stack_fold_roundpd\r
+ ;CHECK: vroundpd $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)\r
+ ret <2 x double> %2\r
+}\r
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone\r
+\r
+; Stack-folding check for llvm.x86.avx.round.pd.256 called with immediate 7: FileCheck expects\r
+; a ymm "vroundpd $7" reading an %rsp-relative slot annotated "32-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm1-xmm15 and flags (presumably to force a spill; confirm).\r
+define <4 x double> @stack_fold_roundpd_ymm(<4 x double> %a0) {\r
+ ;CHECK-LABEL: stack_fold_roundpd_ymm\r
+ ;CHECK: vroundpd $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 7)\r
+ ret <4 x double> %2\r
+}\r
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone\r
+\r
+; Stack-folding check for llvm.x86.sse41.round.ps called with immediate 7: FileCheck expects\r
+; "vroundps $7" reading an %rsp-relative slot annotated "16-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm1-xmm15 and flags (presumably to force a spill; confirm).\r
+define <4 x float> @stack_fold_roundps(<4 x float> %a0) {\r
+ ;CHECK-LABEL: stack_fold_roundps\r
+ ;CHECK: vroundps $7, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone\r
+\r
+; Stack-folding check for llvm.x86.avx.round.ps.256 called with immediate 7: FileCheck expects\r
+; a ymm "vroundps $7" reading an %rsp-relative slot annotated "32-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm1-xmm15 and flags (presumably to force a spill; confirm).\r
+define <8 x float> @stack_fold_roundps_ymm(<8 x float> %a0) {\r
+ ;CHECK-LABEL: stack_fold_roundps_ymm\r
+ ;CHECK: vroundps $7, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 7)\r
+ ret <8 x float> %2\r
+}\r
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone\r
+\r
+; TODO stack_fold_roundsd\r
+\r
+; TODO stack_fold_roundsd_int\r
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone\r
+\r
+; TODO stack_fold_roundss\r
+\r
+; TODO stack_fold_roundss_int\r
+declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone\r
+\r
+; TODO stack_fold_rsqrtps\r
+\r
define <4 x float> @stack_fold_rsqrtps_int(<4 x float> %a0) {
;CHECK-LABEL: stack_fold_rsqrtps_int
;CHECK: vrsqrtps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%2 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
ret <4 x float> %2
}
-declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
-
-; TODO stack_fold_rsqrtps_ymm
-; TODO stack_fold_rsqrtps_ymm_int
-; TODO stack_fold_rsqrtss
-; TODO stack_fold_rsqrtss_int
-
-define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {
- ;CHECK-LABEL: stack_fold_shufpd
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone\r
+\r
+; TODO stack_fold_rsqrtps_ymm\r
+\r
+; Stack-folding check for the 256-bit llvm.x86.avx.rsqrt.ps.256 intrinsic: FileCheck expects a\r
+; ymm vrsqrtps reading an %rsp-relative slot annotated "32-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm1-xmm15 and flags (presumably to force a spill; confirm).\r
+define <8 x float> @stack_fold_rsqrtps_ymm_int(<8 x float> %a0) {\r
+ ;CHECK-LABEL: stack_fold_rsqrtps_ymm_int\r
+ ;CHECK: vrsqrtps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)\r
+ ret <8 x float> %2\r
+}\r
+declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone\r
+\r
+; TODO stack_fold_rsqrtss\r
+\r
+; Stack-folding check for the single-operand llvm.x86.sse.rsqrt.ss intrinsic: FileCheck expects\r
+; the three-operand vrsqrtss form with an %rsp-relative source annotated "16-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm1-xmm15 and flags (presumably to force a spill; confirm).\r
+define <4 x float> @stack_fold_rsqrtss_int(<4 x float> %a0) {\r
+ ;CHECK-LABEL: stack_fold_rsqrtss_int\r
+ ;CHECK: vrsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone\r
+\r
+define <2 x double> @stack_fold_shufpd(<2 x double> %a0, <2 x double> %a1) {\r
+ ;CHECK-LABEL: stack_fold_shufpd\r
;CHECK: vshufpd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2>
;CHECK: vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fsub double %a0, %a1
- ret double %2
-}
-
-; TODO stack_fold_subsd_int
-declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define float @stack_fold_subss(float %a0, float %a1) {
+ ret double %2\r
+}\r
+\r
+; Stack-folding check for the llvm.x86.sse2.sub.sd intrinsic: FileCheck expects a vsubsd whose\r
+; source is an %rsp-relative slot annotated "16-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm2-xmm15 and flags (presumably to force a spill; confirm).\r
+define <2 x double> @stack_fold_subsd_int(<2 x double> %a0, <2 x double> %a1) {\r
+ ;CHECK-LABEL: stack_fold_subsd_int\r
+ ;CHECK: vsubsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <2 x double> @llvm.x86.sse2.sub.sd(<2 x double> %a0, <2 x double> %a1)\r
+ ret <2 x double> %2\r
+}\r
+declare <2 x double> @llvm.x86.sse2.sub.sd(<2 x double>, <2 x double>) nounwind readnone\r
+\r
+define float @stack_fold_subss(float %a0, float %a1) {\r
;CHECK-LABEL: stack_fold_subss
;CHECK: vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = fsub float %a0, %a1
- ret float %2
-}
-
-; TODO stack_fold_subss_int
-declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone
-
-; TODO stack_fold_testpd
-declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
-
-; TODO stack_fold_testpd_ymm
-declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
-
-; TODO stack_fold_testps
-declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
-
-; TODO stack_fold_testps_ymm
-declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
-
-define i32 @stack_fold_ucomisd(double %a0, double %a1) {
+ ret float %2\r
+}\r
+\r
+; Stack-folding check for the llvm.x86.sse.sub.ss intrinsic: FileCheck expects a vsubss whose\r
+; source is an %rsp-relative slot annotated "16-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm2-xmm15 and flags (presumably to force a spill; confirm).\r
+define <4 x float> @stack_fold_subss_int(<4 x float> %a0, <4 x float> %a1) {\r
+ ;CHECK-LABEL: stack_fold_subss_int\r
+ ;CHECK: vsubss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call <4 x float> @llvm.x86.sse.sub.ss(<4 x float> %a0, <4 x float> %a1)\r
+ ret <4 x float> %2\r
+}\r
+declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind readnone\r
+\r
+; Stack-folding check for the i32-returning llvm.x86.avx.vtestc.pd intrinsic (xmm form):\r
+; FileCheck expects a vtestpd reading an %rsp-relative slot annotated "16-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm2-xmm15 and flags (presumably to force a spill; confirm).\r
+define i32 @stack_fold_testpd(<2 x double> %a0, <2 x double> %a1) {\r
+ ;CHECK-LABEL: stack_fold_testpd\r
+ ;CHECK: vtestpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)\r
+ ret i32 %2\r
+}\r
+declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone\r
+\r
+; Stack-folding check for the i32-returning llvm.x86.avx.vtestc.pd.256 intrinsic (ymm form):\r
+; FileCheck expects a vtestpd reading an %rsp-relative slot annotated "32-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm2-xmm15 and flags (presumably to force a spill; confirm).\r
+define i32 @stack_fold_testpd_ymm(<4 x double> %a0, <4 x double> %a1) {\r
+ ;CHECK-LABEL: stack_fold_testpd_ymm\r
+ ;CHECK: vtestpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)\r
+ ret i32 %2\r
+}\r
+declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone\r
+\r
+; Stack-folding check for the i32-returning llvm.x86.avx.vtestc.ps intrinsic (xmm form):\r
+; FileCheck expects a vtestps reading an %rsp-relative slot annotated "16-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm2-xmm15 and flags (presumably to force a spill; confirm).\r
+define i32 @stack_fold_testps(<4 x float> %a0, <4 x float> %a1) {\r
+ ;CHECK-LABEL: stack_fold_testps\r
+ ;CHECK: vtestps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)\r
+ ret i32 %2\r
+}\r
+declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone\r
+\r
+; Stack-folding check for the i32-returning llvm.x86.avx.vtestc.ps.256 intrinsic (ymm form):\r
+; FileCheck expects a vtestps reading an %rsp-relative slot annotated "32-byte Folded Reload".\r
+; The "nop" inline asm clobbers xmm2-xmm15 and flags (presumably to force a spill; confirm).\r
+define i32 @stack_fold_testps_ymm(<8 x float> %a0, <8 x float> %a1) {\r
+ ;CHECK-LABEL: stack_fold_testps_ymm\r
+ ;CHECK: vtestps {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload\r
+ %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()\r
+ %2 = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)\r
+ ret i32 %2\r
+}\r
+declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone\r
+\r
+define i32 @stack_fold_ucomisd(double %a0, double %a1) {\r
;CHECK-LABEL: stack_fold_ucomisd
;CHECK: vucomisd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
%1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()