}
+define <8 x float> @test8(float %a, <8 x float> %b, <8 x float> %c) { ; computes fma(splat(-%a), %b, %c) on <8 x float>; the checks below expect an xor with a broadcast -0.0 constant, a broadcast to ymm, then vfmadd213ps
+; X32-LABEL: test8:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
+; X32-NEXT: vxorps %xmm3, %xmm2, %xmm2
+; X32-NEXT: vbroadcastss %xmm2, %ymm2
+; X32-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
+; X32-NEXT: retl
+;
+; X64-LABEL: test8:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0,-0,-0,-0]
+; X64-NEXT: vxorps %xmm3, %xmm0, %xmm0
+; X64-NEXT: vbroadcastss %xmm0, %ymm0
+; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
+; X64-NEXT: retq
+entry:
+ %0 = fsub float -0.0, %a ; -0.0 - %a, i.e. the negation of the scalar %a
+ %1 = insertelement <8 x float> undef, float %0, i32 0 ; put the negated scalar in lane 0; the remaining lanes stay undef
+ %2 = shufflevector <8 x float> %1, <8 x float> undef, <8 x i32> zeroinitializer ; all-zero mask picks lane 0 for every result lane, i.e. a splat of -%a
+ %3 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %2, <8 x float> %b, <8 x float> %c) ; splat(-%a) * %b + %c
+ ret <8 x float> %3
+}
+
declare <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)