From 3f2b2c218faf9c08975f24629e01c28cf88f5feb Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@gmail.com>
Date: Wed, 2 Nov 2011 04:42:13 +0000
Subject: [PATCH] Add a bunch more X86 AVX2 instructions and their
 corresponding intrinsics.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@143529 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IntrinsicsX86.td           | 104 +++++++++++++
 lib/Target/X86/X86InstrFragmentsSIMD.td |   5 +-
 lib/Target/X86/X86InstrSSE.td           | 253 +++++++++++++++++++++++++++++++-
 test/CodeGen/X86/avx2-intrinsics-x86.ll | 224 ++++++++++++++++++++++++++++
 4 files changed, 584 insertions(+), 2 deletions(-)

diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index 2092c810dde..acccb149596 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -1525,6 +1525,110 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
                         llvm_v16i16_ty], [IntrNoMem]>;
 }
 
+// Absolute value ops
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx2_pabs_b : GCCBuiltin<"__builtin_ia32_pabsb256">,
+              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+  def int_x86_avx2_pabs_w : GCCBuiltin<"__builtin_ia32_pabsw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty], [IntrNoMem]>;
+  def int_x86_avx2_pabs_d : GCCBuiltin<"__builtin_ia32_pabsd256">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty], [IntrNoMem]>;
+}
+
+// Horizontal arithmetic ops
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx2_phadd_w : GCCBuiltin<"__builtin_ia32_phaddw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_v16i16_ty], [IntrNoMem]>;
+  def int_x86_avx2_phadd_d : GCCBuiltin<"__builtin_ia32_phaddd256">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+                         llvm_v8i32_ty], [IntrNoMem]>;
+  def int_x86_avx2_phadd_sw : GCCBuiltin<"__builtin_ia32_phaddsw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_v16i16_ty], [IntrNoMem]>;
+  def int_x86_avx2_phsub_w : GCCBuiltin<"__builtin_ia32_phsubw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_v16i16_ty], [IntrNoMem]>;
+  def int_x86_avx2_phsub_d : GCCBuiltin<"__builtin_ia32_phsubd256">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+                         llvm_v8i32_ty], [IntrNoMem]>;
+  def int_x86_avx2_phsub_sw : GCCBuiltin<"__builtin_ia32_phsubsw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_v16i16_ty], [IntrNoMem]>;
+  def int_x86_avx2_pmadd_ub_sw : GCCBuiltin<"__builtin_ia32_pmaddubsw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty,
+                         llvm_v32i8_ty], [IntrNoMem]>;
+}
+
+// Sign ops
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx2_psign_b : GCCBuiltin<"__builtin_ia32_psignb256">,
+              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
+                         llvm_v32i8_ty], [IntrNoMem]>;
+  def int_x86_avx2_psign_w : GCCBuiltin<"__builtin_ia32_psignw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_v16i16_ty], [IntrNoMem]>;
+  def int_x86_avx2_psign_d : GCCBuiltin<"__builtin_ia32_psignd256">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+                         llvm_v8i32_ty], [IntrNoMem]>;
+}
+
+// Packed multiply high with round and scale
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx2_pmul_hr_sw : GCCBuiltin<"__builtin_ia32_pmulhrsw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_v16i16_ty], [IntrNoMem, Commutative]>;
+}
+
+// Vector sign and zero extend
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
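+  // Each of these intrinsics widens a 128-bit source vector into a 256-bit
+  // result; for example (mirroring the calls exercised by the test file in
+  // this patch), IR such as
+  //   %r = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %x)
+  // sign-extends the low eight bytes of %x into eight 32-bit elements.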
+  def int_x86_avx2_pmovsxbd : GCCBuiltin<"__builtin_ia32_pmovsxbd256">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v16i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx2_pmovsxbq : GCCBuiltin<"__builtin_ia32_pmovsxbq256">,
+              Intrinsic<[llvm_v4i64_ty], [llvm_v16i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx2_pmovsxbw : GCCBuiltin<"__builtin_ia32_pmovsxbw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx2_pmovsxdq : GCCBuiltin<"__builtin_ia32_pmovsxdq256">,
+              Intrinsic<[llvm_v4i64_ty], [llvm_v4i32_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx2_pmovsxwd : GCCBuiltin<"__builtin_ia32_pmovsxwd256">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v8i16_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx2_pmovsxwq : GCCBuiltin<"__builtin_ia32_pmovsxwq256">,
+              Intrinsic<[llvm_v4i64_ty], [llvm_v8i16_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx2_pmovzxbd : GCCBuiltin<"__builtin_ia32_pmovzxbd256">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v16i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx2_pmovzxbq : GCCBuiltin<"__builtin_ia32_pmovzxbq256">,
+              Intrinsic<[llvm_v4i64_ty], [llvm_v16i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx2_pmovzxbw : GCCBuiltin<"__builtin_ia32_pmovzxbw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i8_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx2_pmovzxdq : GCCBuiltin<"__builtin_ia32_pmovzxdq256">,
+              Intrinsic<[llvm_v4i64_ty], [llvm_v4i32_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx2_pmovzxwd : GCCBuiltin<"__builtin_ia32_pmovzxwd256">,
+              Intrinsic<[llvm_v8i32_ty], [llvm_v8i16_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx2_pmovzxwq : GCCBuiltin<"__builtin_ia32_pmovzxwq256">,
+              Intrinsic<[llvm_v4i64_ty], [llvm_v8i16_ty],
+                        [IntrNoMem]>;
+}
+
+// Misc.
+let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
+  def int_x86_avx2_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb256">,
+              Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
+  def int_x86_avx2_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb256">,
+              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
+                         llvm_v32i8_ty], [IntrNoMem]>;
+}
+
 //===----------------------------------------------------------------------===//
 // MMX
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index af919fba8ee..6fd2efdab8f 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -276,11 +276,12 @@ def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>;
 def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
 
 // 256-bit memop pattern fragments
-def memopv32i8 : PatFrag<(ops node:$ptr), (v32i8 (memop node:$ptr))>;
 def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
 def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
 def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
 def memopv8i32 : PatFrag<(ops node:$ptr), (v8i32 (memop node:$ptr))>;
+def memopv16i16 : PatFrag<(ops node:$ptr), (v16i16 (memop node:$ptr))>;
+def memopv32i8 : PatFrag<(ops node:$ptr), (v32i8 (memop node:$ptr))>;
 
 // SSSE3 uses MMX registers for some instructions. They aren't aligned on a
 // 16-byte boundary.
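(A minimal usage sketch for the two fragments added above, using an assumed
function name and 3.0-era IR syntax: they let instruction selection fold a
sufficiently aligned whole-vector load into the memory operand of the new
256-bit patterns.

  define <16 x i16> @fold_sketch(<16 x i16> %a, <16 x i16>* %p) nounwind {
    %b = load <16 x i16>* %p, align 32
    %r = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a, <16 x i16> %b)
    ret <16 x i16> %r
  }
  declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

With AVX2 enabled, the load of %b can then be selected into the memory form of
vphaddw through memopv16i16.)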
@@ -326,6 +327,8 @@ def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
 def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
 
 // 256-bit bitconvert pattern fragments
+def bc_v32i8 : PatFrag<(ops node:$in), (v32i8 (bitconvert node:$in))>;
+def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
 def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
 def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
 
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index b5eea457800..f30a0c4699a 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -4008,6 +4008,23 @@ def mi : Ii8<0x70, MRMSrcMem,
                 (bc_frag (memopv2i64 addr:$src1)),
                 (undef))))]>;
 }
+
+multiclass sse2_pshuffle_y<string OpcodeStr, ValueType vt, PatFrag pshuf_frag,
+                           PatFrag bc_frag> {
+def Yri : Ii8<0x70, MRMSrcReg,
+              (outs VR256:$dst), (ins VR256:$src1, i8imm:$src2),
+              !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+              [(set VR256:$dst, (vt (pshuf_frag:$src2 VR256:$src1,
+                                     (undef))))]>;
+def Ymi : Ii8<0x70, MRMSrcMem,
+              (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2),
+              !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+              [(set VR256:$dst, (vt (pshuf_frag:$src2
+                                     (bc_frag (memopv4i64 addr:$src1)),
+                                     (undef))))]>;
+}
 } // ExeDomain = SSEPackedInt
 
 let Predicates = [HasAVX] in {
@@ -4052,6 +4069,20 @@ let Predicates = [HasAVX] in {
             (VPSHUFLWmi addr:$src, imm:$imm)>;
 }
 
+let Predicates = [HasAVX2] in {
+  let AddedComplexity = 5 in
+  defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, pshufd, bc_v8i32>, TB,
+                                 OpSize, VEX;
+
+  // SSE2 with ImmT == Imm8 and XS prefix.
+  defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, pshufhw, bc_v16i16>, XS,
+                                  VEX;
+
+  // SSE2 with ImmT == Imm8 and XD prefix.
+  defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, pshuflw, bc_v16i16>, XD,
+                                  VEX;
+}
+
 let Predicates = [HasSSE2] in {
   let AddedComplexity = 5 in
   defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, pshufd, bc_v4i32>, TB, OpSize;
@@ -4114,6 +4145,19 @@ multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
                                    addr:$src2))))]>;
 }
 
+multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
+                         SDNode OpNode, PatFrag bc_frag> {
+  def Yrr : PDI<opc, MRMSrcReg,
+      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+      [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>;
+  def Yrm : PDI<opc, MRMSrcMem,
+      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+      !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+      [(set VR256:$dst, (vt (OpNode VR256:$src1,
+                                    (bc_frag (memopv4i64 addr:$src2)))))]>;
+}
+
 let Predicates = [HasAVX] in {
   defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Punpcklbw,
                                 bc_v16i8, 0>, VEX_4V;
@@ -4156,6 +4200,48 @@ let Predicates = [HasAVX] in {
                                          (memopv2i64 addr:$src2))))]>, VEX_4V;
 }
 
+let Predicates = [HasAVX2] in {
+  defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Punpcklbw,
+                                  bc_v32i8>, VEX_4V;
+  defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Punpcklwd,
+                                  bc_v16i16>, VEX_4V;
+  defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Punpckldq,
+                                  bc_v8i32>, VEX_4V;
+
+  /// FIXME: we could eliminate this and use sse2_unpack_y instead if tblgen
+  /// knew to collapse (bitconvert VT to VT) into its operand.
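+  /// (The v4i64 forms below take memopv4i64 directly, whereas sse2_unpack_y
+  /// wraps its load as (bc_frag (memopv4i64 ...)); for v4i64 that wrapper
+  /// would be the no-op (bitconvert v4i64 to v4i64) that tblgen cannot yet
+  /// fold away.)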
+  def VPUNPCKLQDQYrr : PDI<0x6C, MRMSrcReg,
+                           (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+                           "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                           [(set VR256:$dst, (v4i64 (X86Punpcklqdq VR256:$src1,
+                                                     VR256:$src2)))]>, VEX_4V;
+  def VPUNPCKLQDQYrm : PDI<0x6C, MRMSrcMem,
+                           (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+                           "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                           [(set VR256:$dst, (v4i64 (X86Punpcklqdq VR256:$src1,
+                                                     (memopv4i64 addr:$src2))))]>, VEX_4V;
+
+  defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Punpckhbw,
+                                  bc_v32i8>, VEX_4V;
+  defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Punpckhwd,
+                                  bc_v16i16>, VEX_4V;
+  defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Punpckhdq,
+                                  bc_v8i32>, VEX_4V;
+
+  /// FIXME: we could eliminate this and use sse2_unpack_y instead if tblgen
+  /// knew to collapse (bitconvert VT to VT) into its operand.
+  def VPUNPCKHQDQYrr : PDI<0x6D, MRMSrcReg,
+                           (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
+                           "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                           [(set VR256:$dst, (v4i64 (X86Punpckhqdq VR256:$src1,
+                                                     VR256:$src2)))]>, VEX_4V;
+  def VPUNPCKHQDQYrm : PDI<0x6D, MRMSrcMem,
+                           (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
+                           "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                           [(set VR256:$dst, (v4i64 (X86Punpckhqdq VR256:$src1,
+                                                     (memopv4i64 addr:$src2))))]>, VEX_4V;
+}
+
 let Constraints = "$src1 = $dst" in {
   defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Punpcklbw, bc_v16i8>;
   defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Punpcklwd, bc_v8i16>;
@@ -4266,6 +4352,15 @@ def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
            [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>, VEX;
 def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX;
+
+let Predicates = [HasAVX2] in {
+def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}",
+           [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX;
+def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
+           "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX;
+}
+
 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
            [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
@@ -5016,6 +5111,23 @@ multiclass SS3I_unop_rm_int<bits<8> opc, string OpcodeStr,
                     (bitconvert (mem_frag128 addr:$src))))]>, OpSize;
 }
 
+/// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
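+/// For example (matching the calls exercised by the test file in this patch),
+/// IR such as
+///   %r = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %x)
+/// maps onto the rr256 form below and selects vpabsb on a YMM register.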
+multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
+                              PatFrag mem_frag256, Intrinsic IntId256> {
+  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+                    (ins VR256:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst, (IntId256 VR256:$src))]>,
+                    OpSize;
+
+  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+                    (ins i256mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR256:$dst,
+                      (IntId256
+                       (bitconvert (mem_frag256 addr:$src))))]>, OpSize;
+}
+
 let Predicates = [HasAVX] in {
   defm VPABSB : SS3I_unop_rm_int<0x1C, "vpabsb", memopv16i8,
                                  int_x86_ssse3_pabs_b_128>, VEX;
@@ -5025,6 +5137,15 @@ let Predicates = [HasAVX] in {
                                  int_x86_ssse3_pabs_d_128>, VEX;
 }
 
+let Predicates = [HasAVX2] in {
+  defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb", memopv32i8,
+                                   int_x86_avx2_pabs_b>, VEX;
+  defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw", memopv16i16,
+                                   int_x86_avx2_pabs_w>, VEX;
+  defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd", memopv8i32,
+                                   int_x86_avx2_pabs_d>, VEX;
+}
+
 defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb", memopv16i8,
                               int_x86_ssse3_pabs_b_128>;
 defm PABSW : SS3I_unop_rm_int<0x1D, "pabsw", memopv8i16,
@@ -5055,7 +5176,23 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set VR128:$dst, (IntId128 VR128:$src1,
-          (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+          (bitconvert (mem_frag128 addr:$src2))))]>, OpSize;
+}
+
+multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+                               PatFrag mem_frag256, Intrinsic IntId256> {
+  let isCommutable = 1 in
+  def rr256 : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
+       (ins VR256:$src1, VR256:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
+       OpSize;
+  def rm256 : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
+       (ins VR256:$src1, i256mem:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set VR256:$dst,
+         (IntId256 VR256:$src1,
+          (bitconvert (mem_frag256 addr:$src2))))]>, OpSize;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX] in {
@@ -5087,6 +5224,35 @@ defm VPMULHRSW : SS3I_binop_rm_int<0x0B, "vpmulhrsw", memopv8i16,
                                    int_x86_ssse3_pmul_hr_sw_128, 0>, VEX_4V;
 }
 
+let ImmT = NoImm, Predicates = [HasAVX2] in {
+let isCommutable = 0 in {
+  defm VPHADDW : SS3I_binop_rm_int_y<0x01, "vphaddw", memopv16i16,
+                                     int_x86_avx2_phadd_w>, VEX_4V;
+  defm VPHADDD : SS3I_binop_rm_int_y<0x02, "vphaddd", memopv8i32,
+                                     int_x86_avx2_phadd_d>, VEX_4V;
+  defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw", memopv16i16,
+                                      int_x86_avx2_phadd_sw>, VEX_4V;
+  defm VPHSUBW : SS3I_binop_rm_int_y<0x05, "vphsubw", memopv16i16,
+                                     int_x86_avx2_phsub_w>, VEX_4V;
+  defm VPHSUBD : SS3I_binop_rm_int_y<0x06, "vphsubd", memopv8i32,
+                                     int_x86_avx2_phsub_d>, VEX_4V;
+  defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw", memopv16i16,
+                                      int_x86_avx2_phsub_sw>, VEX_4V;
+  defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw", memopv32i8,
+                                        int_x86_avx2_pmadd_ub_sw>, VEX_4V;
+  defm VPSHUFB : SS3I_binop_rm_int_y<0x00, "vpshufb", memopv32i8,
+                                     int_x86_avx2_pshuf_b>, VEX_4V;
+  defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", memopv32i8,
+                                     int_x86_avx2_psign_b>, VEX_4V;
+  defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", memopv16i16,
+                                     int_x86_avx2_psign_w>, VEX_4V;
+  defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", memopv8i32,
+                                     int_x86_avx2_psign_d>, VEX_4V;
+}
+defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", memopv16i16,
+                                     int_x86_avx2_pmul_hr_sw>, VEX_4V;
+}
+
 // None of these have i8 immediate fields.
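 // (A usage sketch, mirroring the test file added by this patch: the AVX2
 // forms above are reached through calls such as
 //   %r = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a, <16 x i16> %b)
 // which selects vpmulhrsw. Only that multiply is marked commutable; the
 // horizontal add/subtract, vpmaddubsw, vpshufb and vpsign forms are not.)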
 let ImmT = NoImm, Constraints = "$src1 = $dst" in {
 let isCommutable = 0 in {
@@ -5166,8 +5332,23 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> {
       []>, OpSize;
 }
 
+multiclass ssse3_palign_y<string asm, bit Is2Addr = 1> {
+  def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
+      (ins VR256:$src1, VR256:$src2, i8imm:$src3),
+      !strconcat(asm,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>, OpSize;
+  def R256rm : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
+      (ins VR256:$src1, i256mem:$src2, i8imm:$src3),
+      !strconcat(asm,
+                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>, OpSize;
+}
+
 let Predicates = [HasAVX] in
   defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
+let Predicates = [HasAVX2] in
+  defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V;
 let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in
   defm PALIGN : ssse3_palign<"palignr">;
@@ -5235,6 +5416,17 @@ multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
                  OpSize;
 }
 
+multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
+                                  Intrinsic IntId> {
+  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+
+  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst,
+                    (IntId (bitconvert (memopv2i64 addr:$src))))]>, OpSize;
+}
+
 let Predicates = [HasAVX] in {
 defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw",
                                      int_x86_sse41_pmovsxbw>, VEX;
@@ -5250,6 +5442,21 @@ defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq",
                                      int_x86_sse41_pmovzxdq>, VEX;
 }
 
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
+                                        int_x86_avx2_pmovsxbw>, VEX;
+defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
+                                        int_x86_avx2_pmovsxwd>, VEX;
+defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
+                                        int_x86_avx2_pmovsxdq>, VEX;
+defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
+                                        int_x86_avx2_pmovzxbw>, VEX;
+defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
+                                        int_x86_avx2_pmovzxwd>, VEX;
+defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
+                                        int_x86_avx2_pmovzxdq>, VEX;
+}
+
 defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
 defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
 defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
@@ -5336,6 +5543,19 @@ multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
                  OpSize;
 }
 
+multiclass SS41I_binop_rm_int8_y<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId> {
+  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+
+  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i64mem:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst,
+                    (IntId (bitconvert (v2i64 (vzmovl_v2i64 addr:$src)))))]>,
+                  OpSize;
+}
+
 let Predicates = [HasAVX] in {
 defm VPMOVSXBD : SS41I_binop_rm_int4<0x21, "vpmovsxbd",
                                      int_x86_sse41_pmovsxbd>, VEX;
@@ -5347,6 +5567,17 @@ defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq",
                                      int_x86_sse41_pmovzxwq>, VEX;
 }
 
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
+                                       int_x86_avx2_pmovsxbd>, VEX;
+defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
+                                       int_x86_avx2_pmovsxwq>, VEX;
+defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
+                                       int_x86_avx2_pmovzxbd>, VEX;
+defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
+                                       int_x86_avx2_pmovzxwq>, VEX;
+}
+
 defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
 defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
 defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
@@ -5391,12 +5622,32 @@ multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
                  OpSize;
 }
 
+multiclass SS41I_binop_rm_int4_y<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId> {
+  def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst, (IntId VR128:$src))]>, OpSize;
+
+  // Expecting an i32 load, zero-extended into the low element of a v4i32.
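+  // (vzmovl_v4i32 below is assumed to match a 32-bit scalar load placed in
+  // element 0 of a v4i32 with the upper elements zeroed; this is how the
+  // 32-bit memory forms of vpmovsxbq/vpmovzxbq read their source.)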
+  def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i32mem:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set VR256:$dst,
+                    (IntId (bitconvert (v4i32 (vzmovl_v4i32 addr:$src)))))]>,
+                  OpSize;
+}
+
 let Predicates = [HasAVX] in {
 defm VPMOVSXBQ : SS41I_binop_rm_int2<0x22, "vpmovsxbq", int_x86_sse41_pmovsxbq>,
                                      VEX;
 defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
                                      VEX;
 }
+let Predicates = [HasAVX2] in {
+defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
+                                       int_x86_avx2_pmovsxbq>, VEX;
+defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
+                                       int_x86_avx2_pmovzxbq>, VEX;
+}
 defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
 defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 9e24f50f05e..2dc0f5ce2e5 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -160,6 +160,14 @@ define <32 x i8> @test_x86_avx2_pminu_b(<32 x i8> %a0, <32 x i8> %a1) {
 declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
+define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) {
+  ; CHECK: vpmovmskb
+  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; <i32> [#uses=1]
+  ret i32 %res
+}
+declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
+
+
 define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) {
   ; CHECK: vpmulhw
   %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
@@ -382,3 +390,219 @@ define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
   ret <16 x i16> %res
 }
 declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_pabs_b(<32 x i8> %a0) {
+  ; CHECK: vpabsb
+  %res = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pabs_d(<8 x i32> %a0) {
+  ; CHECK: vpabsd
+  %res = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pabs_w(<16 x i16> %a0) {
+  ; CHECK: vpabsw
+  %res = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) {
+  ; CHECK: vphaddd
+  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) {
+  ; CHECK: vphaddsw
+  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) {
+  ; CHECK: vphaddw
+  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) {
+  ; CHECK: vphsubd
+  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) {
+  ; CHECK: vphsubsw
+  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) {
+  ; CHECK: vphsubw
+  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) {
+  ; CHECK: vpmaddubsw
+  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) {
+  ; CHECK: vpmulhrsw
+  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) {
+  ; CHECK: vpshufb
+  %res = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) {
+  ; CHECK: vpsignb
+  %res = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) {
+  ; CHECK: vpsignd
+  %res = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) {
+  ; CHECK: vpsignw
+  %res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) {
+  ; CHECK: vpmovsxbd
+  %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovsxbq(<16 x i8> %a0) {
+  ; CHECK: vpmovsxbq
+  %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pmovsxbw(<16 x i8> %a0) {
+  ; CHECK: vpmovsxbw
+  %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovsxdq(<4 x i32> %a0) {
+  ; CHECK: vpmovsxdq
+  %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmovsxwd(<8 x i16> %a0) {
+  ; CHECK: vpmovsxwd
+  %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovsxwq(<8 x i16> %a0) {
+  ; CHECK: vpmovsxwq
+  %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmovzxbd(<16 x i8> %a0) {
+  ; CHECK: vpmovzxbd
+  %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovzxbq(<16 x i8> %a0) {
+  ; CHECK: vpmovzxbq
+  %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pmovzxbw(<16 x i8> %a0) {
+  ; CHECK: vpmovzxbw
+  %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovzxdq(<4 x i32> %a0) {
+  ; CHECK: vpmovzxdq
+  %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmovzxwd(<8 x i16> %a0) {
+  ; CHECK: vpmovzxwd
+  %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
+  ; CHECK: vpmovzxwq
+  %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
-- 
2.11.0