From c700b40b547f3401eab615d40dcbca6f158c31f7 Mon Sep 17 00:00:00 2001
From: Ayman Musa
Date: Tue, 6 Dec 2016 12:24:14 +0000
Subject: [PATCH] [X86][AVX512] Detect repeated constant patterns in BUILD_VECTOR suitable for broadcasting.

Check if a build_vector node includes a repeated constant pattern and
replace it with a broadcast of that pattern.
For example:
"build_vector <0, 1, 2, 3, 0, 1, 2, 3>" would be replaced by
"broadcast <0, 1, 2, 3>"

Differential Revision: https://reviews.llvm.org/D26802

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288804 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp                |  119 +-
 test/CodeGen/X86/avg.ll                           |    4 +-
 test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll | 1205 +++++++++++++++++++++
 test/CodeGen/X86/vec_shift6.ll                    |    6 +-
 test/CodeGen/X86/vector-shuffle-512-v64.ll        |    6 +-
 5 files changed, 1329 insertions(+), 11 deletions(-)
 create mode 100644 test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 128dbff1219..bac2c8e8145 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6316,8 +6316,47 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   return SDValue();
 }
 
-/// Attempt to use the vbroadcast instruction to generate a splat value for a
-/// splat BUILD_VECTOR which uses a single scalar load, or a constant.
+static Constant *getConstantVector(MVT VT, APInt SplatValue,
+                                   unsigned SplatBitSize, LLVMContext &C) {
+  unsigned ScalarSize = VT.getScalarSizeInBits();
+  unsigned NumElm = SplatBitSize / ScalarSize;
+
+  SmallVector<Constant *, 32> ConstantVec;
+  for (unsigned i = 0; i < NumElm; i++) {
+    APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
+    Constant *Const;
+    if (VT.isFloatingPoint()) {
+      assert((ScalarSize == 32 || ScalarSize == 64) &&
+             "Unsupported floating point scalar size");
+      if (ScalarSize == 32)
+        Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
+      else
+        Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
+    } else
+      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
+    ConstantVec.push_back(Const);
+  }
+  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
+}
+
+static bool isUseOfShuffle(SDNode *N) {
+  for (auto *U : N->uses()) {
+    if (isTargetShuffle(U->getOpcode()))
+      return true;
+    if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
+      return isUseOfShuffle(U);
+  }
+  return false;
+}
+
+/// Attempt to use the vbroadcast instruction to generate a splat value for the
+/// following cases:
+/// 1. A splat BUILD_VECTOR which uses:
+///    a. A single scalar load, or a constant.
+///    b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
+/// 2. A splat shuffle which uses a scalar_to_vector node which comes from
+///    a scalar load, or a constant.
+///
 /// The VBROADCAST node is returned when a pattern is found,
 /// or SDValue() otherwise.
 static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
@@ -6339,8 +6378,82 @@ static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget
   // We need a splat of a single value to use broadcast, and it doesn't
   // make any sense if the value is only in one element of the vector.
-  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
+  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
+    APInt SplatValue, Undef;
+    unsigned SplatBitSize;
+    bool HasUndef;
+    // Check if this is a repeated constant pattern suitable for broadcasting.
+    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
+        SplatBitSize > VT.getScalarSizeInBits() &&
+        SplatBitSize < VT.getSizeInBits()) {
+      // Avoid replacing with broadcast when it's a use of a shuffle
+      // instruction to preserve the present custom lowering of shuffles.
+      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
+        return SDValue();
+      // replace BUILD_VECTOR with broadcast of the repeated constants.
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      LLVMContext *Ctx = DAG.getContext();
+      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
+      if (Subtarget.hasAVX()) {
+        if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
+            !(SplatBitSize == 64 && Subtarget.is32Bit())) {
+          // Splatted value can fit in one INTEGER constant in constant pool.
+          // Load the constant and broadcast it.
+          MVT CVT = MVT::getIntegerVT(SplatBitSize);
+          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
+          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
+          SDValue CP = DAG.getConstantPool(C, PVT);
+          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+          Ld = DAG.getLoad(
+              CVT, dl, DAG.getEntryNode(), CP,
+              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+              Alignment);
+          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+                                       MVT::getVectorVT(CVT, Repeat), Ld);
+          return DAG.getBitcast(VT, Brdcst);
+        } else if (SplatBitSize == 32 || SplatBitSize == 64) {
+          // Splatted value can fit in one FLOAT constant in constant pool.
+          // Load the constant and broadcast it.
+          // AVX has support for 32 and 64 bit broadcast for floats only.
+          // No 64bit integer in 32bit subtarget.
+          MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
+          Constant *C = SplatBitSize == 32
+                            ? ConstantFP::get(Type::getFloatTy(*Ctx),
+                                              SplatValue.bitsToFloat())
+                            : ConstantFP::get(Type::getDoubleTy(*Ctx),
+                                              SplatValue.bitsToDouble());
+          SDValue CP = DAG.getConstantPool(C, PVT);
+          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
+
+          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+          Ld = DAG.getLoad(
+              CVT, dl, DAG.getEntryNode(), CP,
+              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+              Alignment);
+          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
+                                       MVT::getVectorVT(CVT, Repeat), Ld);
+          return DAG.getBitcast(VT, Brdcst);
+        } else if (SplatBitSize > 64) {
+          // Load the vector of constants and broadcast it.
+          MVT CVT = VT.getScalarType();
+          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
+                                             *Ctx);
+          SDValue VCP = DAG.getConstantPool(VecC, PVT);
+          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
+          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
+          Ld = DAG.getLoad(
+              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
+              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+              Alignment);
+          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
+          return DAG.getBitcast(VT, Brdcst);
+        }
+      }
+    }
     return SDValue();
+  }
 
   bool ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
                         Ld.getOpcode() == ISD::ConstantFP);
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index a3b3353c5e3..99d3f206f0c 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -2132,7 +2132,7 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
-; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm4 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3
 ; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2
 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1
@@ -2405,7 +2405,7 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8]
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1
 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0
 ; AVX512F-NEXT: vpsrld $1, %zmm0, %zmm0
diff --git a/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
new file mode 100644
index 00000000000..b13965a30ed
--- /dev/null
+++ b/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll
@@ -0,0 +1,1205 @@
+; NOTE: Assertions have been simplified MANUALLY after running
utils/update_llc_test_checks.py +; Assertions for constant pools have been added MANUALLY. +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX2 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=NO-AVX512BW -check-prefix=AVX512 +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL32 -check-prefix=AVX512BW -check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX2 -check-prefix=AVX2-64 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=NO-AVX512BW -check-prefix=AVX512 -check-prefix=AVX512F-64 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s -check-prefix=ALL -check-prefix=ALL64 -check-prefix=AVX512BW -check-prefix=AVX512 -check-prefix=AVX512BW-64 + +;===-----------------------------------------------------------------------------=== +; This test checks the ability to recognize a cross element pattern of +; constants and perform the load via broadcasting a smaller constant +; vector. +; For example: +; => broadcast of the constant vector +;===-----------------------------------------------------------------------------=== + +; ALL: LCPI0 +; ALL-NEXT: .short 256 # 0x100 + +define <16 x i8> @f16xi8_i16(<16 x i8> %a) { +; ALL32-LABEL: f16xi8_i16: +; ALL32: # BB#0: +; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %xmm1 +; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: retl +; +; ALL64-LABEL: f16xi8_i16: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: retq + %res1 = add <16 x i8> , %a + %res2 = and <16 x i8> , %res1 + ret <16 x i8> %res2 +} + + +; ALL: .LCPI1 +; ALL-NEXT: .long 50462976 # 0x3020100 + +; AVX: .LCPI1 +; AVX-NEXT .long 50462976 # float 3.82047143E-37 + +define <16 x i8> @f16xi8_i32(<16 x i8> %a) { +; ALL32-LABEL: f16xi8_i32: +; ALL32: # BB#0: +; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: retl +; +; ALL64-LABEL: f16xi8_i32: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: retq +; +; AVX-LABEL: f16xi8_i32: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 + %res1 = add <16 x i8> , %a + %res2 = and <16 x i8> , %res1 + ret <16 x i8> %res2 +} + + +; ALL64: .LCPI2 +; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100 + +; AVX: .LCPI2 +; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 + +define <16 x i8> @f16xi8_i64(<16 x i8> %a) { +; ALL32-LABEL: f16xi8_i64: +; ALL32: # BB#0: +; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; ALL32-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: retl +; +; ALL64-LABEL: f16xi8_i64: +; ALL64: # BB#0: +; ALL64-NEXT: 
vpbroadcastq {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: retq +; +; AVX-LABEL: f16xi8_i64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 + %res1 = add <16 x i8> , %a + %res2 = and <16 x i8> , %res1 + ret <16 x i8> %res2 +} + + +; ALL: .LCPI3 +; ALL-NEXT: .short 256 # 0x100 + +define <32 x i8> @f32xi8_i16(<32 x i8> %a) { +; ALL32-LABEL: f32xi8_i16: +; ALL32: # BB#0: +; ALL32-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm1 +; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: retl +; +; ALL64-LABEL: f32xi8_i16: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastw {{.*}}(%rip), %ymm1 +; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: retq + %res1 = add <32 x i8> , %a + %res2 = and <32 x i8> , %res1 + ret <32 x i8> %res2 +} + + +; ALL: .LCPI4 +; ALL-NEXT: .long 50462976 # 0x3020100 + +; AVX: .LCPI4 +; AVX-NEXT: .long 50462976 # float 3.82047143E-37 + +define <32 x i8> @f32xi8_i32(<32 x i8> %a) { +; ALL32-LABEL: f32xi8_i32: +; ALL32: # BB#0: +; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1 +; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: retl +; +; ALL64-LABEL: f32xi8_i32: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: retq +; +; AVX-LABEL: f32xi8_i32: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2 +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 + %res1 = add <32 x i8> , %a + %res2 = and <32 x i8> , %res1 + ret <32 x i8> %res2 +} + + +; ALL64: .LCPI5 +; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100 + +; AVX: .LCPI5 +; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 + +define <32 x i8> @f32xi8_i64(<32 x i8> %a) { +; ALL32-LABEL: f32xi8_i64: +; ALL32: # BB#0: +; ALL32-NEXT: vpbroadcastq {{\.LCPI.*}}, %ymm1 +; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL32-NEXT: retl +; +; ALL64-LABEL: f32xi8_i64: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: retq +; +; AVX-LABEL: f32xi8_i64: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 + %res1 = add <32 x i8> , %a + %res2 = and <32 x i8> , %res1 + ret <32 x i8> %res2 +} + + +; ALL: .LCPI6 +; ALL-NEXT: .byte 0 # 0x0 +; ALL-NEXT: .byte 1 # 0x1 +; ALL-NEXT: .byte 2 # 0x2 +; ALL-NEXT: .byte 3 # 0x3 +; ALL-NEXT: .byte 4 # 0x4 +; ALL-NEXT: .byte 5 # 0x5 +; ALL-NEXT: .byte 6 # 0x6 +; ALL-NEXT: .byte 7 # 0x7 +; ALL-NEXT: .byte 8 # 0x8 +; ALL-NEXT: .byte 9 # 0x9 +; ALL-NEXT: .byte 10 # 0xa +; ALL-NEXT: .byte 11 # 0xb +; ALL-NEXT: .byte 12 # 0xc +; ALL-NEXT: .byte 13 # 0xd +; ALL-NEXT: .byte 14 # 0xe +; ALL-NEXT: .byte 15 # 0xf +; ALL-NOT: .byte + +define <32 x i8> @f32xi8_i128(<32 x i8> %a) { +; ALL-LABEL: f32xi8_i128: +; ALL: # BB#0: +; ALL-NEXT: vbroadcasti128 
{{.*#+}} ymm1 = mem[0,1,0,1] +; ALL-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 + %res1 = add <32 x i8> , %a + %res2 = and <32 x i8> , %res1 + ret <32 x i8> %res2 +} + + +; ALL: .LCPI7 +; ALL-NEXT: .short 256 # 0x100 + +define <64 x i8> @f64xi8_i16(<64 x i8> %a) { +; NO-AVX512BW-LABEL: f64xi8_i16: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %ymm2 +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; +; AVX512BW-LABEL: f64xi8_i16: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpbroadcastw {{\.LCPI.*}}, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 + %res1 = add <64 x i8> , %a + %res2 = and <64 x i8> , %res1 + ret <64 x i8> %res2 +} + + +; ALL: .LCPI8 +; ALL-NEXT: .long 50462976 # 0x3020100 + +; AVX: .LCPI8 +; AVX-NEXT: .long 50462976 # float 3.82047143E-37 + +define <64 x i8> @f64i8_i32(<64 x i8> %a) { +; NO-AVX512BW-LABEL: f64i8_i32: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2 +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; +; AVX512BW-LABEL: f64i8_i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; +; AVX-LABEL: f64i8_i32: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 + %res1 = add <64 x i8> , %a + %res2 = and <64 x i8> , %res1 + ret <64 x i8> %res2 +} + + +; ALL64: .LCPI9 +; ALL64-NEXT: .quad 506097522914230528 # 0x706050403020100 + +; ALL32: .LCPI9 +; ALL32-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 + +; AVX: .LCPI9 +; AVX-NEXT: .quad 506097522914230528 # double 7.9499288951273625E-275 + +define <64 x i8> @f64xi8_i64(<64 x i8> %a) { +; NO-AVX512BW-LABEL: f64xi8_i64: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2 +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; +; AVX512BW-LABEL: f64xi8_i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1 +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; +; AVX-LABEL: f64xi8_i64: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = 
[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 + %res1 = add <64 x i8> , %a + %res2 = and <64 x i8> , %res1 + ret <64 x i8> %res2 +} + + +; ALL: .LCPI10 +; ALL-NEXT: .byte 0 # 0x0 +; ALL-NEXT: .byte 1 # 0x1 +; ALL-NEXT: .byte 2 # 0x2 +; ALL-NEXT: .byte 3 # 0x3 +; ALL-NEXT: .byte 4 # 0x4 +; ALL-NEXT: .byte 5 # 0x5 +; ALL-NEXT: .byte 6 # 0x6 +; ALL-NEXT: .byte 7 # 0x7 +; ALL-NEXT: .byte 8 # 0x8 +; ALL-NEXT: .byte 9 # 0x9 +; ALL-NEXT: .byte 10 # 0xa +; ALL-NEXT: .byte 11 # 0xb +; ALL-NEXT: .byte 12 # 0xc +; ALL-NEXT: .byte 13 # 0xd +; ALL-NEXT: .byte 14 # 0xe +; ALL-NEXT: .byte 15 # 0xf +; ALL-NOT: .byte + +define <64 x i8> @f64xi8_i128(<64 x i8> %a) { +; NO-AVX512BW-LABEL: f64xi8_i128: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; +; AVX512BW-LABEL: f64xi8_i128: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 + %res1 = add <64 x i8> , %a + %res2 = and <64 x i8> , %res1 + ret <64 x i8> %res2 +} + + +; AVX512BW: .LCPI11 +; AVX512BW-NEXT: .byte 0 # 0x0 +; AVX512BW-NEXT: .byte 1 # 0x1 +; AVX512BW-NEXT: .byte 2 # 0x2 +; AVX512BW-NEXT: .byte 3 # 0x3 +; AVX512BW-NEXT: .byte 4 # 0x4 +; AVX512BW-NEXT: .byte 5 # 0x5 +; AVX512BW-NEXT: .byte 6 # 0x6 +; AVX512BW-NEXT: .byte 7 # 0x7 +; AVX512BW-NEXT: .byte 8 # 0x8 +; AVX512BW-NEXT: .byte 9 # 0x9 +; AVX512BW-NEXT: .byte 10 # 0xa +; AVX512BW-NEXT: .byte 11 # 0xb +; AVX512BW-NEXT: .byte 12 # 0xc +; AVX512BW-NEXT: .byte 13 # 0xd +; AVX512BW-NEXT: .byte 14 # 0xe +; AVX512BW-NEXT: .byte 15 # 0xf +; AVX512BW-NEXT: .byte 16 # 0x10 +; AVX512BW-NEXT: .byte 17 # 0x11 +; AVX512BW-NEXT: .byte 18 # 0x12 +; AVX512BW-NEXT: .byte 19 # 0x13 +; AVX512BW-NEXT: .byte 20 # 0x14 +; AVX512BW-NEXT: .byte 21 # 0x15 +; AVX512BW-NEXT: .byte 22 # 0x16 +; AVX512BW-NEXT: .byte 23 # 0x17 +; AVX512BW-NEXT: .byte 24 # 0x18 +; AVX512BW-NEXT: .byte 25 # 0x19 +; AVX512BW-NEXT: .byte 26 # 0x1a +; AVX512BW-NEXT: .byte 27 # 0x1b +; AVX512BW-NEXT: .byte 28 # 0x1c +; AVX512BW-NEXT: .byte 29 # 0x1d +; AVX512BW-NEXT: .byte 30 # 0x1e +; AVX512BW-NEXT: .byte 31 # 0x1f +; AVX512BW-NOT: .byte + +define <64 x i8> @f64xi8_i256(<64 x i8> %a) { +; AVX512BW-LABEL: f64xi8_i256: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 + %res1 = add <64 x i8> , %a + %res2 = and <64 x i8> , %res1 + ret <64 x i8> %res2 +} + + +; ALL: .LCPI12 +; ALL-NEXT: .long 65536 # 0x10000 + +; AVX: .LCPI12 +; AVX-NEXT: .long 65536 # float 9.18354962E-41 + +define <8 x i16> @f8xi16_i32(<8 x i16> %a) { +; ALL32-LABEL: f8xi16_i32: +; ALL32: # BB#0: +; ALL32-NEXT: vpbroadcastd {{\.LCPI.*}}, %xmm1 +; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: retl +; +; ALL64-LABEL: f8xi16_i32: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: retq +; +; AVX-LABEL: f8xi16_i32: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm1 +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; 
AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 + %res1 = add <8 x i16> , %a + %res2 = and <8 x i16> , %res1 + ret <8 x i16> %res2 +} + + +; ALL64: .LCPI13 +; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000 + +; ALL32: .LCPI13 +; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 + +; AVX: .LCPI13 +; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 + +define <8 x i16> @f8xi16_i64(<8 x i16> %a) { +; ALL32-LABEL: f8xi16_i64: +; ALL32: # BB#0: +; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; ALL32-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: retl +; +; ALL64-LABEL: f8xi16_i64: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: retq +; +; AVX-LABEL: f8xi16_i64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 + %res1 = add <8 x i16> , %a + %res2 = and <8 x i16> , %res1 + ret <8 x i16> %res2 +} + + +; ALL: .LCPI14 +; ALL-NEXT: .long 65536 # 0x10000 + +; AVX: .LCPI14 +; AVX-NEXT: .long 65536 # float 9.18354962E-41 + +define <16 x i16> @f16xi16_i32(<16 x i16> %a) { +; ALL-LABEL: f16xi16_i32: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm1 +; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; +; AVX-LABEL: f16xi16_i32: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm2 +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 + %res1 = add <16 x i16> , %a + %res2 = and <16 x i16> , %res1 + ret <16 x i16> %res2 +} + + +; ALL64: .LCPI15 +; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000 + +; ALL32: .LCPI15 +; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 + +; AVX: .LCPI15 +; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 + +define <16 x i16> @f16xi16_i64(<16 x i16> %a) { +; ALL-LABEL: f16xi16_i64: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1 +; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; +; AVX-LABEL: f16xi16_i64: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 + %res1 = add <16 x i16> , %a + %res2 = and <16 x i16> , %res1 + ret <16 x i16> %res2 +} + + +; ALL: .LCPI16 +; ALL-NEXT: .short 0 # 0x0 +; ALL-NEXT: .short 1 # 0x1 +; ALL-NEXT: .short 2 # 0x2 +; ALL-NEXT: .short 3 # 0x3 +; ALL-NEXT: .short 4 # 0x4 +; ALL-NEXT: .short 5 # 0x5 +; ALL-NEXT: .short 6 # 0x6 +; ALL-NEXT: .short 7 # 0x7 +; ALL-NOT: .short + +define <16 x i16> @f16xi16_i128(<16 x i16> %a) { +; ALL-LABEL: f16xi16_i128: +; ALL: # BB#0: +; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; ALL-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 + %res1 = add <16 x i16> , %a + %res2 = and <16 x i16> , %res1 + ret <16 x i16> %res2 +} + + +; ALL: .LCPI17 +; ALL-NEXT: .long 65536 # 0x10000 + +; AVX: .LCPI17 +; AVX-NEXT: .long 65536 # float 9.18354962E-41 + +define <32 x i16> @f32xi16_i32(<32 x i16> %a) { +; NO-AVX512BW-LABEL: f32xi16_i32: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %ymm2 
+; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; +; AVX512BW-LABEL: f32xi16_i32: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpbroadcastd {{\.LCPI.*}}, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; +; AVX-LABEL: f32xi16_i32: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vbroadcastss {{\.LCPI.*}}, %xmm3 +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 + %res1 = add <32 x i16> , %a + %res2 = and <32 x i16> , %res1 + ret <32 x i16> %res2 +} + + +; ALL64: .LCPI18 +; ALL64-NEXT: .quad 844433520132096 # 0x3000200010000 + +; ALL32: .LCPI18 +; ALL32-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 + +; AVX: .LCPI18 +; AVX-NEXT: .quad 844433520132096 # double 4.1720559249406128E-309 + +define <32 x i16> @f32xi16_i64(<32 x i16> %a) { +; NO-AVX512BW-LABEL: f32xi16_i64: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vpbroadcastq {{.*}}, %ymm2 +; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; +; AVX512BW-LABEL: f32xi16_i64: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vpbroadcastq {{.*}}, %zmm1 +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; +; AVX-LABEL: f32xi16_i64: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 + %res1 = add <32 x i16> , %a + %res2 = and <32 x i16> , %res1 + ret <32 x i16> %res2 +} + + +; ALL: .LCPI19 +; ALL-NEXT: .short 0 # 0x0 +; ALL-NEXT: .short 1 # 0x1 +; ALL-NEXT: .short 2 # 0x2 +; ALL-NEXT: .short 3 # 0x3 +; ALL-NEXT: .short 4 # 0x4 +; ALL-NEXT: .short 5 # 0x5 +; ALL-NEXT: .short 6 # 0x6 +; ALL-NEXT: .short 7 # 0x7 +; ALL-NOT: .short + +define <32 x i16> @f32xi16_i128(<32 x i16> %a) { +; NO-AVX512BW-LABEL: f32xi16_i128: +; NO-AVX512BW: # BB#0: +; NO-AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; NO-AVX512BW-NEXT: vpaddw %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; NO-AVX512BW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; +; AVX512BW-LABEL: f32xi16_i128: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 + %res1 = add <32 x i16> , %a + %res2 = and <32 x i16> , %res1 + ret <32 x i16> %res2 +} + + +; AVX512BW: .LCPI20 +; AVX512BW-NEXT: .short 0 # 0x0 +; 
AVX512BW-NEXT: .short 1 # 0x1 +; AVX512BW-NEXT: .short 2 # 0x2 +; AVX512BW-NEXT: .short 3 # 0x3 +; AVX512BW-NEXT: .short 4 # 0x4 +; AVX512BW-NEXT: .short 5 # 0x5 +; AVX512BW-NEXT: .short 6 # 0x6 +; AVX512BW-NEXT: .short 7 # 0x7 +; AVX512BW-NEXT: .short 8 # 0x8 +; AVX512BW-NEXT: .short 9 # 0x9 +; AVX512BW-NEXT: .short 10 # 0xa +; AVX512BW-NEXT: .short 11 # 0xb +; AVX512BW-NEXT: .short 12 # 0xc +; AVX512BW-NEXT: .short 13 # 0xd +; AVX512BW-NEXT: .short 14 # 0xe +; AVX512BW-NEXT: .short 15 # 0xf +; AVX512BW-NOT: .short + +define <32 x i16> @f32xi16_i256(<32 x i16> %a) { +; AVX512BW-LABEL: f32xi16_i256: +; AVX512BW: # BB#0: +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 + %res1 = add <32 x i16> , %a + %res2 = and <32 x i16> , %res1 + ret <32 x i16> %res2 +} + + +; ALL64: .LCPI21 +; ALL64-NEXT: .quad 4294967296 # 0x100000000 + +; ALL32: .LCPI21 +; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 + +; AVX: .LCPI21 +; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 + +define <4 x i32> @f4xi32_i64(<4 x i32> %a) { +; ALL32-LABEL: f4xi32_i64: +; ALL32: # BB#0: +; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; ALL32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: retl +; +; ALL64-LABEL: f4xi32_i64: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: retq +; +; AVX-LABEL: f4xi32_i64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 + %res1 = add <4 x i32> , %a + %res2 = and <4 x i32> , %res1 + ret <4 x i32> %res2 +} + + +; ALL64: .LCPI22 +; ALL64-NEXT: .quad 4294967296 # 0x100000000 + +; ALL32: .LCPI22 +; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 + +; AVX: .LCPI22 +; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 + +define <8 x i32> @f8xi32_i64(<8 x i32> %a) { +; ALL-LABEL: f8xi32_i64: +; ALL: # BB#0: +; ALL-NEXT: vpbroadcastq {{.*}}, %ymm1 +; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; +; AVX-LABEL: f8xi32_i64: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] +; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.LCPI.*}}, %ymm0, %ymm0 + %res1 = add <8 x i32> , %a + %res2 = and <8 x i32> , %res1 + ret <8 x i32> %res2 +} + + +; ALL: .LCPI23 +; ALL-NEXT: .long 0 # 0x0 +; ALL-NEXT: .long 1 # 0x1 +; ALL-NEXT: .long 2 # 0x2 +; ALL-NEXT: .long 3 # 0x3 +; ALL-NOT: .long + +define <8 x i32> @f8xi32_i128(<8 x i32> %a) { +; ALL-LABEL: f8xi32_i128: +; ALL: # BB#0: +; ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; ALL-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: vpand %ymm1, %ymm0, %ymm0 + %res1 = add <8 x i32> , %a + %res2 = and <8 x i32> , %res1 + ret <8 x i32> %res2 +} + + +; ALL64: .LCPI24 +; ALL64-NEXT: .quad 4294967296 # 0x100000000 + +; ALL32: .LCPI24 +; ALL32-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 + +; AVX: .LCPI24 +; AVX-NEXT: .quad 4294967296 # double 2.1219957909652723E-314 + +define <16 x i32> @f16xi32_i64(<16 x i32> %a) { +; AVX2-LABEL: f16xi32_i64: +; AVX2: # BB#0: +; AVX2-NEXT: vpbroadcastq {{.*}}, %ymm2 +; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm2, 
%ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; +; AVX512-LABEL: f16xi32_i64: +; AVX512: # BB#0: +; AVX512-NEXT: vpbroadcastq {{.*}}, %zmm1 +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; +; AVX-LABEL: f16xi32_i64: +; AVX: # BB#0: +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0] +; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1] +; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 + %res1 = add <16 x i32> , %a + %res2 = and <16 x i32> , %res1 + ret <16 x i32> %res2 +} + + +; ALL: .LCPI25 +; ALL-NEXT: .long 0 # 0x0 +; ALL-NEXT: .long 1 # 0x1 +; ALL-NEXT: .long 2 # 0x2 +; ALL-NEXT: .long 3 # 0x3 +; ALL-NOT: .long + +define <16 x i32> @f16xi32_i128(<16 x i32> %a) { +; AVX2-LABEL: f16xi32_i128: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; +; AVX512-LABEL: f16xi32_i128: +; AVX512: # BB#0: +; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 + %res1 = add <16 x i32> , %a + %res2 = and <16 x i32> , %res1 + ret <16 x i32> %res2 +} + + +; ALL64: .LCPI26 +; ALL64-NEXT: .quad 0 # 0x0 +; ALL64-NEXT: .quad 1 # 0x1 +; ALL64-NOT: .quad + +define <4 x i64> @f4xi64_i128(<4 x i64> %a) { +; ALL64-LABEL: f4xi64_i128: +; ALL64: # BB#0: +; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] +; ALL64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; ALL64-NEXT: retq + %res1 = add <4 x i64> , %a + %res2 = and <4 x i64> , %res1 + ret <4 x i64> %res2 +} + + +; ALL64: .LCPI27 +; ALL64-NEXT: .quad 0 # 0x0 +; ALL64-NEXT: .quad 1 # 0x1 +; ALL64-NOT: .quad + +define <8 x i64> @f8xi64_i128(<8 x i64> %a) { +; AVX2-64-LABEL: f8xi64_i128: +; AVX2-64: # BB#0: +; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f8xi64_i128: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: retq +; +; AVX512BW-64-LABEL: f8xi64_i128: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq + %res1 = add <8 x i64> , %a + %res2 = and <8 x i64> , %res1 + ret <8 x i64> %res2 +} + + +; ALL64: .LCPI28 +; ALL64-NEXT: .quad 0 # 0x0 +; ALL64-NEXT: .quad 1 # 0x1 +; ALL64-NEXT: .quad 2 # 0x2 +; ALL64-NEXT: .quad 3 # 0x3 +; ALL64-NOT: .quad + +define <8 x i64> @f8xi64_i256(<8 x i64> %a) { +; AVX512F-64-LABEL: f8xi64_i256: +; AVX512F-64: # BB#0: +; AVX512F-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 
= mem[0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-64-NEXT: retq +; +; AVX512BW-64-LABEL: f8xi64_i256: +; AVX512BW-64: # BB#0: +; AVX512BW-64-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-64-NEXT: retq + %res1 = add <8 x i64> , %a + %res2 = and <8 x i64> , %res1 + ret <8 x i64> %res2 +} + + +; ALL: .LCPI29 +; ALL-NEXT: .quad 4575657222482165760 + +; AVX: .LCPI29 +; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492 + +define <4 x float> @f4xf32_f64(<4 x float> %a) { +; ALL32-LABEL: f4xf32_f64: +; ALL32: # BB#0: +; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; ALL32-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; ALL32-NEXT: retl +; +; ALL64-LABEL: f4xf32_f64: +; ALL64: # BB#0: +; ALL64-NEXT: vpbroadcastq {{.*}}(%rip), %xmm1 +; ALL64-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; ALL64-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; ALL64-NEXT: retq +; +; AVX-LABEL: f4xf32_f64: +; AVX: # BB#0: +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] +; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0 + %res1 = fadd <4 x float> , %a + %res2 = fdiv <4 x float> , %res1 + ret <4 x float> %res2 +} + + +; ALL64: .LCPI30 +; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000 + +; ALL32: .LCPI30 +; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492 + +; AVX: .LCPI30 +; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492 + +define <8 x float> @f8xf32_f64(<8 x float> %a) { +; ALL-LABEL: f8xf32_f64: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastsd {{.*}}, %ymm1 +; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; +; AVX-LABEL: f8xf32_f64: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm1 +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 + %res1 = fadd <8 x float> , %a + %res2 = fdiv <8 x float> , %res1 + ret <8 x float> %res2 +} + + +; ALL: .LCPI31 +; ALL-NEXT: .long 1082130432 # float 4 +; ALL-NEXT: .long 1065353216 # float 1 +; ALL-NEXT: .long 1073741824 # float 2 +; ALL-NEXT: .long 1077936128 # float 3 +; ALL-NOT: .long + +define <8 x float> @f8xf32_f128(<8 x float> %a) { +; ALL-LABEL: f8xf32_f128: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; ALL-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; ALL-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; +; AVX-LABEL: f8xf32_f128: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 + %res1 = fadd <8 x float> , %a + %res2 = fdiv <8 x float> , %res1 + ret <8 x float> %res2 +} + + +; ALL64: .LCPI32 +; ALL64-NEXT: .quad 4575657222482165760 # 0x3f80000040000000 + +; ALL32: .LCPI32 +; ALL32-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492 + +; AVX: .LCPI32 +; AVX-NEXT: .quad 4575657222482165760 # double 0.0078125018626451492 + +define <16 x float> @f16xf32_f64(<16 x float> %a) { +; AVX2-LABEL: f16xf32_f64: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastsd {{.*}}, %ymm2 +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; +; AVX512-LABEL: f16xf32_f64: +; AVX512: # BB#0: +; AVX512-NEXT: vbroadcastsd {{.*}}, %zmm1 +; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; 
+; AVX-LABEL: f16xf32_f64: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastsd {{\.LCPI.*}}, %ymm2 +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 + %res1 = fadd <16 x float> , %a + %res2 = fdiv <16 x float> , %res1 + ret <16 x float> %res2 +} + + +; ALL: .LCPI33 +; ALL-NEXT: .long 1082130432 # float 4 +; ALL-NEXT: .long 1065353216 # float 1 +; ALL-NEXT: .long 1073741824 # float 2 +; ALL-NEXT: .long 1077936128 # float 3 +; ALL-NOT: .long + +define <16 x float> @f16xf32_f128(<16 x float> %a) { +; AVX2-LABEL: f16xf32_f128: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vdivps %ymm1, %ymm2, %ymm1 +; +; AVX512-LABEL: f16xf32_f128: +; AVX512: # BB#0: +; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; +; AVX-LABEL: f16xf32_f128: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vdivps %ymm1, %ymm2, %ymm1 + %res1 = fadd <16 x float> , %a + %res2 = fdiv <16 x float> , %res1 + ret <16 x float> %res2 +} + + +; AVX512: .LCPI34 +; AVX512-NEXT: .long 1090519040 # float 8 +; AVX512-NEXT: .long 1065353216 # float 1 +; AVX512-NEXT: .long 1073741824 # float 2 +; AVX512-NEXT: .long 1077936128 # float 3 +; AVX512-NEXT: .long 1082130432 # float 4 +; AVX512-NEXT: .long 1084227584 # float 5 +; AVX512-NEXT: .long 1086324736 # float 6 +; AVX512-NEXT: .long 1088421888 # float 7 +; AVX512-NOT: .long + +define <16 x float> @f16xf32_f256(<16 x float> %a) { +; AVX512-LABEL: f16xf32_f256: +; AVX512: # BB#0: +; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0 + %res1 = fadd <16 x float> , %a + %res2 = fdiv <16 x float> , %res1 + ret <16 x float> %res2 +} + + +; ALL: .LCPI35 +; ALL-NEXT: .quad 4611686018427387904 # double 2 +; ALL-NEXT: .quad 4607182418800017408 # double 1 +; ALL-NOT: .quad + +define <4 x double> @f4xf64_f128(<4 x double> %a) { +; ALL-LABEL: f4xf64_f128: +; ALL: # BB#0: +; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; ALL-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; ALL-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; +; AVX-LABEL: f4xf64_f128: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] +; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0 + %res1 = fadd <4 x double> , %a + %res2 = fdiv <4 x double> , %res1 + ret <4 x double> %res2 +} + + +; ALL: .LCPI36 +; ALL-NEXT: .quad 4611686018427387904 # double 2 +; ALL-NEXT: .quad 4607182418800017408 # double 1 +; ALL-NOT: .quad + +define <8 x double> @f8xf64_f128(<8 x double> %a) { +; AVX2-LABEL: f8xf64_f128: +; AVX2: # BB#0: +; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vdivpd %ymm1, %ymm2, %ymm1 +; +; AVX512-LABEL: f8xf64_f128: +; AVX512: # BB#0: +; AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; +; AVX-LABEL: 
f8xf64_f128: +; AVX: # BB#0: +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vdivpd %ymm1, %ymm2, %ymm1 + %res1 = fadd <8 x double> , %a + %res2 = fdiv <8 x double> , %res1 + ret <8 x double> %res2 +} + + +; AVX512: .LCPI37 +; AVX512-NEXT: .quad 4616189618054758400 # double 4 +; AVX512-NEXT: .quad 4607182418800017408 # double 1 +; AVX512-NEXT: .quad 4611686018427387904 # double 2 +; AVX512-NEXT: .quad 4613937818241073152 # double 3 +; AVX512-NOT: .quad + +define <8 x double> @f8xf64_f256(<8 x double> %a) { +; AVX512-LABEL: f8xf64_f256: +; AVX512: # BB#0: +; AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm1 = mem[0,1,2,3,0,1,2,3] +; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vdivpd %zmm0, %zmm1, %zmm0 + %res1 = fadd <8 x double> , %a + %res2 = fdiv <8 x double> , %res1 + ret <8 x double> %res2 +} + diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll index c4b7f204be6..b4a58deff2f 100644 --- a/test/CodeGen/X86/vec_shift6.ll +++ b/test/CodeGen/X86/vec_shift6.ll @@ -153,14 +153,14 @@ define <32 x i16> @test7(<32 x i16> %a) { ; ; AVX2-LABEL: test7: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: test7: ; AVX512: # BB#0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048] +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512-NEXT: retq @@ -183,7 +183,7 @@ define <16 x i32> @test8(<16 x i32> %a) { ; ; AVX2-LABEL: test8: ; AVX2: # BB#0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/vector-shuffle-512-v64.ll b/test/CodeGen/X86/vector-shuffle-512-v64.ll index 5b6c2be1b34..ff0d1bbc6b1 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -466,7 +466,7 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126(<64 x i8> %a, <64 x i8> %b) { ; AVX512F-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512F: # BB#0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512F-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4 ; AVX512F-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 @@ -482,7 +482,7 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_ ; AVX512BW-LABEL: 
shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512BW-NEXT: vpbroadcastw {{.*}}(%rip), %ymm3 ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3,0,1] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 @@ -498,7 +498,7 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_ ; ; AVX512DQ-LABEL: shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_47_80_45_82_43_84_41_86_39_88_37_90_35_92_33_94_31_96_29_98_27_100_25_102_23_104_21_106_19_108_17_110_15_112_13_114_11_116_9_118_7_120_5_122_3_124_1_126: ; AVX512DQ: # BB#0: -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512DQ-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4 ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -- 2.11.0
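
For reference, below is a minimal standalone reproducer of the pattern this change targets, written in the same style as the new broadcast-elm-cross-splat-vec.ll test; the function name and RUN line are illustrative and not part of the patch. The repeated <0,1,2,3> i32 pattern is picked up by the new isConstantSplat path and lowered to a single 128-bit broadcast from the constant pool, matching the f8xi32_i128 checks in that test. The constant is used twice because the lowering deliberately bails out on single-use build_vectors (the hasOneUse check above).

; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s

define <8 x i32> @repeated_i128_pattern(<8 x i32> %a) {
; CHECK-LABEL: repeated_i128_pattern:
; CHECK:       # BB#0:
; CHECK-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
  %r1 = add <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %a
  %r2 = and <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>, %r1
  ret <8 x i32> %r2
}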