From: Simon Pilgrim
Date: Sat, 1 Oct 2016 14:26:11 +0000 (+0000)
Subject: [X86][SSE] Enable commutation from MOVSD/MOVSS to BLENDPD/BLENDPS on SSE41+ targets
X-Git-Tag: android-x86-7.1-r4~26378
X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=1c8d24e339205522c3e73d183d500c14aeb34098;p=android-x86%2Fexternal-llvm.git

[X86][SSE] Enable commutation from MOVSD/MOVSS to BLENDPD/BLENDPS on SSE41+ targets

Instead of choosing between MOVSD/MOVSS and BLENDPD/BLENDPS during shuffle lowering based purely on the subtarget, this lets us select the instruction according to the actual commutation requirements.

We could possibly add BLENDPD/BLENDPS -> MOVSD/MOVSS commutation and MOVSD/MOVSS memory folding using a similar approach, if it proves useful.

I avoided adding AVX512 handling, as I'm not sure when we should be making use of VBLENDPD/VBLENDPS on EVEX targets.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@283037 91177308-0d34-0410-b5e6-96231b3b80d8
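For reference, the mask arithmetic behind the 0x02 and 0x0E immediates used in the patch below: MOVSD/MOVSS take element 0 from their second source and the remaining element(s) from their first, which is exactly a BLENDPD/BLENDPS with immediate 0x01, and swapping the two sources of a blend inverts every bit of that immediate. A minimal, self-contained C++ sketch of that arithmetic (illustrative only; commuteBlendMask is a hypothetical helper, not an LLVM API):

  // Illustrative sketch, not part of the patch.
  // MOVSD dst, a, b produces { b[0], a[1] }     == BLENDPD a, b, 0x01
  // MOVSS dst, a, b produces { b[0], a[1..3] }  == BLENDPS a, b, 0x01
  // Swapping the blend sources flips every immediate bit, giving the
  // commuted values 0x02 (BLENDPD) and 0x0E (BLENDPS).
  #include <cassert>
  #include <cstdint>

  // Blend immediate to use after the two sources have been swapped.
  static uint8_t commuteBlendMask(uint8_t Mask, unsigned NumElts) {
    return static_cast<uint8_t>(~Mask & ((1u << NumElts) - 1));
  }

  int main() {
    assert(commuteBlendMask(0x01, 2) == 0x02); // MOVSD -> commuted BLENDPD
    assert(commuteBlendMask(0x01, 4) == 0x0E); // MOVSS -> commuted BLENDPS
    return 0;
  }

Marking MOVSS/MOVSD as commutable is what lets the two-address pass pick whichever operand order avoids a register copy, which is why the movapd copies disappear in the updated SSE41 test checks below.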
---

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 182e1a68a6d..f26d83c598b 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -3543,6 +3543,28 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
+  case X86::MOVSDrr:
+  case X86::MOVSSrr:
+  case X86::VMOVSDrr:
+  case X86::VMOVSSrr:{
+    // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
+    if (!Subtarget.hasSSE41())
+      return nullptr;
+
+    unsigned Mask, Opc;
+    switch (MI.getOpcode()) {
+    default: llvm_unreachable("Unreachable!");
+    case X86::MOVSDrr:  Opc = X86::BLENDPDrri;  Mask = 0x02; break;
+    case X86::MOVSSrr:  Opc = X86::BLENDPSrri;  Mask = 0x0E; break;
+    case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
+    case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
+    }
+    auto &WorkingMI = cloneIfNew(MI);
+    WorkingMI.setDesc(get(Opc));
+    WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
   case X86::PCLMULQDQrr:
   case X86::VPCLMULQDQrr:{
     // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
@@ -3915,6 +3937,14 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
     }
     return false;
   }
+  case X86::MOVSDrr:
+  case X86::MOVSSrr:
+  case X86::VMOVSDrr:
+  case X86::VMOVSSrr: {
+    if (Subtarget.hasSSE41())
+      return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+    return false;
+  }
   case X86::VPTERNLOGDZrri:    case X86::VPTERNLOGDZrmi:
   case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
   case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index bad2d4ea415..056479c7124 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -508,6 +508,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
 multiclass sse12_move_rr {
+  let isCommutable = 1 in
   def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2),
               !strconcat(base_opc, asm_opr),
diff --git a/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll b/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
index aabb79ed05a..f66c53e8ee6 100644
--- a/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/f16c-intrinsics-fast-isel.ll
@@ -40,7 +40,7 @@ define i16 @test_cvtss_sh(float %a0) nounwind {
 ; X32: # BB#0:
 ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X32-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X32-NEXT: vcvtps2ph $0, %xmm0, %xmm0
 ; X32-NEXT: vmovd %xmm0, %eax
 ; X32-NEXT: # kill: %AX %AX %EAX
@@ -49,7 +49,7 @@ define i16 @test_cvtss_sh(float %a0) nounwind {
 ; X64-LABEL: test_cvtss_sh:
 ; X64: # BB#0:
 ; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; X64-NEXT: vcvtps2ph $0, %xmm0, %xmm0
 ; X64-NEXT: vmovd %xmm0, %eax
 ; X64-NEXT: # kill: %AX %AX %EAX
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 4b12a61a73e..1fe5b54d2bb 100644
--- a/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -172,17 +172,29 @@ define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
 }
 
 define <2 x double> @test_sqrt_sd(<2 x double> %a) {
-; SSE-LABEL: test_sqrt_sd:
-; SSE: # BB#0:
-; SSE-NEXT: sqrtsd %xmm0, %xmm1
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE-NEXT: retq
+; SSE2-LABEL: test_sqrt_sd:
+; SSE2: # BB#0:
+; SSE2-NEXT: sqrtsd %xmm0, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT: retq
 ;
-; AVX-LABEL: test_sqrt_sd:
-; AVX: # BB#0:
-; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; AVX-NEXT: retq
+; SSE41-LABEL: test_sqrt_sd:
+; SSE41: # BB#0:
+; SSE41-NEXT: sqrtsd %xmm0, %xmm1
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: test_sqrt_sd:
+; AVX1: # BB#0:
+; AVX1-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1
+; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX1-NEXT: retq
+;
+; AVX512-LABEL: test_sqrt_sd:
+; AVX512: # BB#0:
+; AVX512-NEXT: vsqrtsd %xmm0, %xmm0, %xmm1
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; AVX512-NEXT: retq
 %1 = extractelement <2 x double> %a, i32 0
 %2 = call double @llvm.sqrt.f64(double %1)
 %3 = insertelement <2 x double> %a, double %2, i32 0
diff --git a/test/CodeGen/X86/vec_ss_load_fold.ll b/test/CodeGen/X86/vec_ss_load_fold.ll
index 3d4fe110a4e..51b874c3a6c 100644
--- a/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=X32
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefix=X32_AVX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefix=X64_AVX
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefix=X32_AVX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefix=X64_AVX
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefix=X32_AVX --check-prefix=X32_AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefix=X64_AVX --check-prefix=X64_AVX1
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefix=X32_AVX --check-prefix=X32_AVX512
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefix=X64_AVX --check-prefix=X64_AVX512
 
 define i16 @test1(float %f) nounwind {
 ; X32-LABEL: test1:
@@ -43,17 +43,29 @@ define i16 @test1(float %f) nounwind {
 ; X32_AVX-NEXT: ## kill: %AX %AX %EAX
 ; X32_AVX-NEXT: retl
 ;
-; X64_AVX-LABEL: test1:
-; X64_AVX: ## BB#0:
-; X64_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64_AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; X64_AVX-NEXT: vsubss {{.*}}(%rip), %xmm0, %xmm0
-; X64_AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
-; X64_AVX-NEXT: vminss {{.*}}(%rip), %xmm0, %xmm0
-; X64_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
-; X64_AVX-NEXT: vcvttss2si %xmm0, %eax
-; X64_AVX-NEXT: ## kill: %AX %AX %EAX
-; X64_AVX-NEXT: retq
+; X64_AVX1-LABEL: test1:
+; X64_AVX1: ## BB#0:
+; X64_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64_AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64_AVX1-NEXT: vsubss {{.*}}(%rip), %xmm0, %xmm0
+; X64_AVX1-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; X64_AVX1-NEXT: vminss {{.*}}(%rip), %xmm0, %xmm0
+; X64_AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; X64_AVX1-NEXT: vcvttss2si %xmm0, %eax
+; X64_AVX1-NEXT: ## kill: %AX %AX %EAX
+; X64_AVX1-NEXT: retq
+;
+; X64_AVX512-LABEL: test1:
+; X64_AVX512: ## BB#0:
+; X64_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64_AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64_AVX512-NEXT: vsubss {{.*}}(%rip), %xmm0, %xmm0
+; X64_AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
+; X64_AVX512-NEXT: vminss {{.*}}(%rip), %xmm0, %xmm0
+; X64_AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
+; X64_AVX512-NEXT: vcvttss2si %xmm0, %eax
+; X64_AVX512-NEXT: ## kill: %AX %AX %EAX
+; X64_AVX512-NEXT: retq
 %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
 %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
 %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 0a94f31be66..6ef383c858e 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1159,16 +1159,43 @@ define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
 }
 
 define <2 x double> @insert_reg_lo_v2f64(double %a, <2 x double> %b) {
-; SSE-LABEL: insert_reg_lo_v2f64:
-; SSE: # BB#0:
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT: movapd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_reg_lo_v2f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
 ;
-; AVX-LABEL: insert_reg_lo_v2f64:
-; AVX: # BB#0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; AVX-NEXT: retq
+; SSE3-LABEL: insert_reg_lo_v2f64:
+; SSE3: # BB#0:
+; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_reg_lo_v2f64:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSSE3-NEXT: movapd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_reg_lo_v2f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_reg_lo_v2f64:
+; AVX1: # BB#0:
+; AVX1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_reg_lo_v2f64:
+; AVX2: # BB#0:
+; AVX2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: insert_reg_lo_v2f64:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX512VL-NEXT: retq
 %v = insertelement <2 x double> undef, double %a, i32 0
 %shuffle = shufflevector <2 x double> %v, <2 x double> %b, <2 x i32>
 ret <2 x double> %shuffle
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 9c431efc4ff..a0ad2388126 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2096,11 +2096,17 @@ define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
 ; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: insert_reg_and_zero_v4f32:
-; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: insert_reg_and_zero_v4f32:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: insert_reg_and_zero_v4f32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; AVX512VL-NEXT: retq
 %v = insertelement <4 x float> undef, float %a, i32 0
 %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32>
 ret <4 x float> %shuffle
@@ -2254,16 +2260,38 @@ define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
 }
 
 define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
-; SSE-LABEL: insert_reg_lo_v4f32:
-; SSE: # BB#0:
-; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; SSE-NEXT: movapd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_reg_lo_v4f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
 ;
-; AVX-LABEL: insert_reg_lo_v4f32:
-; AVX: # BB#0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
-; AVX-NEXT: retq
+; SSE3-LABEL: insert_reg_lo_v4f32:
+; SSE3: # BB#0:
+; SSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSE3-NEXT: movapd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_reg_lo_v4f32:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
+; SSSE3-NEXT: movapd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_reg_lo_v4f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
+;
+; AVX1OR2-LABEL: insert_reg_lo_v4f32:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: insert_reg_lo_v4f32:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; AVX512VL-NEXT: retq
 %a.cast = bitcast double %a to <2 x float>
 %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32>
 %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32>