private:
void Select(SDNode *N) override;
- bool tryGather(SDNode *N, unsigned Opc);
bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
llvm_unreachable("unrecognized size for LdVT");
}
-/// Customized ISel for GATHER operations.
-bool X86DAGToDAGISel::tryGather(SDNode *Node, unsigned Opc) {
- // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
- SDValue Chain = Node->getOperand(0);
- SDValue VSrc = Node->getOperand(2);
- SDValue Base = Node->getOperand(3);
- SDValue VIdx = Node->getOperand(4);
- SDValue VMask = Node->getOperand(5);
- ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
- if (!Scale)
- return false;
-
- SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
- MVT::Other);
-
- SDLoc DL(Node);
-
- // Memory Operands: Base, Scale, Index, Disp, Segment
- SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
- SDValue Segment = CurDAG->getRegister(0, MVT::i32);
- const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx,
- Disp, Segment, VMask, Chain};
- SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
- // Node has 2 outputs: VDst and MVT::Other.
- // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
- // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
- // of ResNode.
- ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
- ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
- CurDAG->RemoveDeadNode(Node);
- return true;
-}
-
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opc, MOpc;
}
break;
}
- case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
- switch (IntNo) {
- default: break;
- case Intrinsic::x86_avx2_gather_d_pd:
- case Intrinsic::x86_avx2_gather_d_pd_256:
- case Intrinsic::x86_avx2_gather_q_pd:
- case Intrinsic::x86_avx2_gather_q_pd_256:
- case Intrinsic::x86_avx2_gather_d_ps:
- case Intrinsic::x86_avx2_gather_d_ps_256:
- case Intrinsic::x86_avx2_gather_q_ps:
- case Intrinsic::x86_avx2_gather_q_ps_256:
- case Intrinsic::x86_avx2_gather_d_q:
- case Intrinsic::x86_avx2_gather_d_q_256:
- case Intrinsic::x86_avx2_gather_q_q:
- case Intrinsic::x86_avx2_gather_q_q_256:
- case Intrinsic::x86_avx2_gather_d_d:
- case Intrinsic::x86_avx2_gather_d_d_256:
- case Intrinsic::x86_avx2_gather_q_d:
- case Intrinsic::x86_avx2_gather_q_d_256: {
- if (!Subtarget->hasAVX2())
- break;
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic");
- case Intrinsic::x86_avx2_gather_d_pd: Opc = X86::VGATHERDPDrm; break;
- case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
- case Intrinsic::x86_avx2_gather_q_pd: Opc = X86::VGATHERQPDrm; break;
- case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
- case Intrinsic::x86_avx2_gather_d_ps: Opc = X86::VGATHERDPSrm; break;
- case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
- case Intrinsic::x86_avx2_gather_q_ps: Opc = X86::VGATHERQPSrm; break;
- case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
- case Intrinsic::x86_avx2_gather_d_q: Opc = X86::VPGATHERDQrm; break;
- case Intrinsic::x86_avx2_gather_d_q_256: Opc = X86::VPGATHERDQYrm; break;
- case Intrinsic::x86_avx2_gather_q_q: Opc = X86::VPGATHERQQrm; break;
- case Intrinsic::x86_avx2_gather_q_q_256: Opc = X86::VPGATHERQQYrm; break;
- case Intrinsic::x86_avx2_gather_d_d: Opc = X86::VPGATHERDDrm; break;
- case Intrinsic::x86_avx2_gather_d_d_256: Opc = X86::VPGATHERDDYrm; break;
- case Intrinsic::x86_avx2_gather_q_d: Opc = X86::VPGATHERQDrm; break;
- case Intrinsic::x86_avx2_gather_q_d_256: Opc = X86::VPGATHERQDYrm; break;
- }
- if (tryGather(Node, Opc))
- return;
- break;
- }
- }
- break;
- }
case X86ISD::GlobalBaseReg:
ReplaceNode(Node, getGlobalBaseReg());
return;
}
}
+static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(Op);
+ auto *C = cast<ConstantSDNode>(ScaleOp);
+ SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ EVT MaskVT = Mask.getValueType();
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = DAG.getRegister(0, MVT::i32);
+ // If source is undef or we know it won't be used, use a zero vector
+ // to break register dependency.
+ // TODO: use undef instead and let ExeDepsFix deal with it?
+ if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
+ Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+ SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+ SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+ return DAG.getMergeValues(RetOps, dl);
+}
+
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
+ case GATHER_AVX2: {
+ SDValue Chain = Op.getOperand(0);
+ SDValue Src = Op.getOperand(2);
+ SDValue Base = Op.getOperand(3);
+ SDValue Index = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ SDValue Scale = Op.getOperand(6);
+ return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+ Scale, Chain, Subtarget);
+ }
case GATHER: {
//gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
- FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
+ FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2
};
struct IntrinsicData {
X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_d, GATHER_AVX2, X86::VPGATHERDDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_d_256, GATHER_AVX2, X86::VPGATHERDDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd, GATHER_AVX2, X86::VGATHERDPDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps, GATHER_AVX2, X86::VGATHERDPSrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q, GATHER_AVX2, X86::VPGATHERDQrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_d_q_256, GATHER_AVX2, X86::VPGATHERDQYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d, GATHER_AVX2, X86::VPGATHERQDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_d_256, GATHER_AVX2, X86::VPGATHERQDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd, GATHER_AVX2, X86::VGATHERQPDrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps, GATHER_AVX2, X86::VGATHERQPSrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q, GATHER_AVX2, X86::VPGATHERQQrm, 0),
+ X86_INTRINSIC_DATA(avx2_gather_q_q_256, GATHER_AVX2, X86::VPGATHERQQYrm, 0),
+
X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
; X32-LABEL: test_x86_avx2_gather_d_ps:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-NEXT: vgatherdps %xmm1, (%eax,%xmm0,2), %xmm2
; X32-NEXT: vmovaps %xmm2, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_x86_avx2_gather_d_ps:
; X64: ## BB#0:
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X64-NEXT: vgatherdps %xmm1, (%rdi,%xmm0,2), %xmm2
; X64-NEXT: vmovaps %xmm2, %xmm0
; X64-NEXT: retq
; X32-LABEL: test_x86_avx2_gather_d_pd:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; X32-NEXT: vgatherdpd %xmm1, (%eax,%xmm0,2), %xmm2
; X32-NEXT: vmovapd %xmm2, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_x86_avx2_gather_d_pd:
; X64: ## BB#0:
+; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; X64-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,2), %xmm2
; X64-NEXT: vmovapd %xmm2, %xmm0
; X64-NEXT: retq
; X32-LABEL: test_x86_avx2_gather_d_ps_256:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorps %ymm2, %ymm2, %ymm2
; X32-NEXT: vgatherdps %ymm1, (%eax,%ymm0,4), %ymm2
; X32-NEXT: vmovaps %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_x86_avx2_gather_d_ps_256:
; X64: ## BB#0:
+; X64-NEXT: vxorps %ymm2, %ymm2, %ymm2
; X64-NEXT: vgatherdps %ymm1, (%rdi,%ymm0,4), %ymm2
; X64-NEXT: vmovaps %ymm2, %ymm0
; X64-NEXT: retq
; X32-LABEL: test_x86_avx2_gather_d_pd_256:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: vxorpd %ymm2, %ymm2, %ymm2
; X32-NEXT: vgatherdpd %ymm1, (%eax,%xmm0,8), %ymm2
; X32-NEXT: vmovapd %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_x86_avx2_gather_d_pd_256:
; X64: ## BB#0:
+; X64-NEXT: vxorpd %ymm2, %ymm2, %ymm2
; X64-NEXT: vgatherdpd %ymm1, (%rdi,%xmm0,8), %ymm2
; X64-NEXT: vmovapd %ymm2, %ymm0
; X64-NEXT: retq
; X32-LABEL: test_mm_i32gather_pd:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT: vmovapd %xmm1, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test_mm_i32gather_pd:
; X64: ## BB#0:
-; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovapd %xmm1, %xmm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT: vmovdqa %xmm1, %xmm0
; X32-NEXT: retl
; X64-LABEL: test_mm_i32gather_epi32:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-NEXT: vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
; X32-NEXT: vmovdqa %ymm1, %ymm0
; X32-NEXT: retl
; X64-LABEL: test_mm256_i32gather_epi32:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X64-NEXT: vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT: vmovdqa %xmm1, %xmm0
; X32-NEXT: retl
; X64-LABEL: test_mm_i32gather_epi64:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-NEXT: vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
; X32-NEXT: vmovdqa %ymm1, %ymm0
; X32-NEXT: retl
; X64-LABEL: test_mm256_i32gather_epi64:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X64-NEXT: vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-NEXT: vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT: vmovapd %xmm1, %xmm0
; X32-NEXT: retl
; X64-LABEL: test_mm_i32gather_pd:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovapd %xmm1, %xmm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT: vmovaps %xmm1, %xmm0
; X32-NEXT: retl
; X64-LABEL: test_mm_i32gather_ps:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT: vmovdqa %xmm1, %xmm0
; X32-NEXT: retl
; X64-LABEL: test_mm_i64gather_epi32:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
; X32-NEXT: vmovdqa %xmm1, %xmm0
; X32-NEXT: vzeroupper
; X64-LABEL: test_mm256_i64gather_epi32:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-NEXT: vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT: vmovdqa %xmm1, %xmm0
; X32-NEXT: retl
; X64-LABEL: test_mm_i64gather_epi64:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT: vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X32-NEXT: vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
; X32-NEXT: vmovdqa %ymm1, %ymm0
; X32-NEXT: retl
; X64-LABEL: test_mm256_i64gather_epi64:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT: vpxor %ymm1, %ymm1, %ymm1
; X64-NEXT: vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT: vmovdqa %ymm1, %ymm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X32-NEXT: vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT: vmovapd %xmm1, %xmm0
; X32-NEXT: retl
; X64-LABEL: test_mm_i64gather_pd:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovapd %xmm1, %xmm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
; X32-NEXT: vmovaps %xmm1, %xmm0
; X32-NEXT: retl
; X64-LABEL: test_mm_i64gather_ps:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: retq
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-NEXT: vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
; X32-NEXT: vmovaps %xmm1, %xmm0
; X32-NEXT: vzeroupper
; X64-LABEL: test_mm256_i64gather_ps:
; X64: # BB#0:
; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
; X64-NEXT: vzeroupper