}
if (HasInt256) {
+ // Custom legalize 2x32 to get a little better code.
+ setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
+
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Legal);
}
- // Custom legalize 2x32 to get a little better code.
- if (Subtarget.hasVLX()) {
- setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
- }
-
// Custom lower several nodes.
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
}
case ISD::MGATHER: {
EVT VT = N->getValueType(0);
- if (VT == MVT::v2f32 && Subtarget.hasVLX()) {
+ if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
auto *Gather = cast<MaskedGatherSDNode>(N);
SDValue Index = Gather->getIndex();
if (Index.getValueType() != MVT::v2i64)
SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
Gather->getValue(),
DAG.getUNDEF(MVT::v2f32));
+ if (!Subtarget.hasVLX()) {
+ // We need to widen the mask, but the instruction will only use 2
+ // of its elements. So we can use undef.
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getUNDEF(MVT::v2i1));
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
+ }
SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
Index };
SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(MVT::v4f32, MVT::v2i1, MVT::Other), Ops, dl,
+ DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
Gather->getMemoryVT(), Gather->getMemOperand());
Results.push_back(Res);
Results.push_back(Res.getValue(2));
;
; X64-LABEL: masked_gather_v2float:
; X64: # BB#0: # %entry
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X64-NEXT: vmovaps (%rdi), %xmm2
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: vpslld $31, %xmm0, %xmm0
; X64-NEXT: vpsrad $31, %xmm0, %xmm0
-; X64-NEXT: vmovaps (%rdi), %xmm2
-; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
+; X64-NEXT: vgatherqps %xmm0, (,%xmm2), %xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
-; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; NOGATHER-LABEL: masked_gather_v2float:
;
; X64-LABEL: masked_gather_v2float_concat:
; X64: # BB#0: # %entry
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; X64-NEXT: vmovaps (%rdi), %xmm2
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-NEXT: vpslld $31, %xmm0, %xmm0
; X64-NEXT: vpsrad $31, %xmm0, %xmm0
-; X64-NEXT: vmovaps (%rdi), %xmm2
-; X64-NEXT: vgatherqps %xmm0, (,%ymm2), %xmm1
+; X64-NEXT: vgatherqps %xmm0, (,%xmm2), %xmm1
; X64-NEXT: vmovaps %xmm1, %xmm0
-; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; NOGATHER-LABEL: masked_gather_v2float_concat: