From b0abb4dc4203b903d8d0b48a952ba0a6312eeeb7 Mon Sep 17 00:00:00 2001 From: Bob Wilson Date: Tue, 11 Aug 2009 05:39:44 +0000 Subject: [PATCH] Use vAny type to get rid of Neon intrinsics that differed only in whether the overloaded vector types allowed floating-point or integer vector elements. Most of these operations actually depend on the element type, so bitcasting was not an option. If you include the vpadd intrinsics that I updated earlier, this gets rid of 20 intrinsics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@78646 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IntrinsicsARM.td | 134 ++++++++++++------------------------- lib/Target/ARM/ARMISelDAGToDAG.cpp | 9 +-- lib/Target/ARM/ARMISelLowering.cpp | 18 ++--- lib/Target/ARM/ARMInstrNEON.td | 72 ++++++++++---------- test/CodeGen/ARM/vabd.ll | 8 +-- test/CodeGen/ARM/vabs.ll | 8 +-- test/CodeGen/ARM/vld1.ll | 40 +++++------ test/CodeGen/ARM/vld2.ll | 16 ++--- test/CodeGen/ARM/vld3.ll | 16 ++--- test/CodeGen/ARM/vld4.ll | 16 ++--- test/CodeGen/ARM/vmax.ll | 8 +-- test/CodeGen/ARM/vmin.ll | 8 +-- test/CodeGen/ARM/vpmax.ll | 4 +- test/CodeGen/ARM/vpmin.ll | 4 +- test/CodeGen/ARM/vrecpe.ll | 8 +-- test/CodeGen/ARM/vrsqrte.ll | 8 +-- test/CodeGen/ARM/vst1.ll | 40 +++++------ test/CodeGen/ARM/vst2.ll | 16 ++--- test/CodeGen/ARM/vst3.ll | 16 ++--- test/CodeGen/ARM/vst4.ll | 16 ++--- test/CodeGen/ARM/vtrn.ll | 32 ++++----- test/CodeGen/ARM/vuzp.ll | 32 ++++----- test/CodeGen/ARM/vzip.ll | 32 ++++----- 23 files changed, 253 insertions(+), 308 deletions(-) diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td index d86dd087107..7b720827638 100644 --- a/include/llvm/IntrinsicsARM.td +++ b/include/llvm/IntrinsicsARM.td @@ -27,53 +27,42 @@ let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.". // The following classes do not correspond directly to GCC builtins. class Neon_1Arg_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; - class Neon_1Arg_Float_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>; class Neon_1Arg_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], + : Intrinsic<[llvm_anyvector_ty], [LLVMExtendedElementVectorType<0>], [IntrNoMem]>; class Neon_1Arg_Long_Intrinsic - : Intrinsic<[llvm_anyint_ty], + : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedElementVectorType<0>], [IntrNoMem]>; class Neon_2Arg_Intrinsic - : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class Neon_2Arg_Float_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem]>; - class Neon_2Arg_Vector_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; class Neon_2Arg_Narrow_Intrinsic - : Intrinsic<[llvm_anyint_ty], + : Intrinsic<[llvm_anyvector_ty], [LLVMExtendedElementVectorType<0>, LLVMExtendedElementVectorType<0>], [IntrNoMem]>; class Neon_2Arg_Long_Intrinsic - : Intrinsic<[llvm_anyint_ty], + : Intrinsic<[llvm_anyvector_ty], [LLVMTruncatedElementVectorType<0>, LLVMTruncatedElementVectorType<0>], [IntrNoMem]>; class Neon_2Arg_Wide_Intrinsic - : Intrinsic<[llvm_anyint_ty], + : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMTruncatedElementVectorType<0>], [IntrNoMem]>; class Neon_3Arg_Intrinsic - : Intrinsic<[llvm_anyint_ty], + : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; class Neon_3Arg_Long_Intrinsic - : Intrinsic<[llvm_anyint_ty], + : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMTruncatedElementVectorType<0>, LLVMTruncatedElementVectorType<0>], [IntrNoMem]>; class Neon_2Result_Intrinsic - : Intrinsic<[llvm_anyint_ty, LLVMMatchType<0>], - [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class Neon_2Result_Float_Intrinsic - : Intrinsic<[llvm_anyfloat_ty, LLVMMatchType<0>], + : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; class Neon_CvtFxToFP_Intrinsic : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>; @@ -141,18 +130,16 @@ let Properties = [IntrNoMem, Commutative] in { // Vector Maximum. def int_arm_neon_vmaxs : Neon_2Arg_Intrinsic; def int_arm_neon_vmaxu : Neon_2Arg_Intrinsic; - def int_arm_neon_vmaxf : Neon_2Arg_Float_Intrinsic; // Vector Minimum. def int_arm_neon_vmins : Neon_2Arg_Intrinsic; def int_arm_neon_vminu : Neon_2Arg_Intrinsic; - def int_arm_neon_vminf : Neon_2Arg_Float_Intrinsic; // Vector Reciprocal Step. - def int_arm_neon_vrecps : Neon_2Arg_Float_Intrinsic; + def int_arm_neon_vrecps : Neon_2Arg_Intrinsic; // Vector Reciprocal Square Root Step. - def int_arm_neon_vrsqrts : Neon_2Arg_Float_Intrinsic; + def int_arm_neon_vrsqrts : Neon_2Arg_Intrinsic; } // Vector Subtract. @@ -186,7 +173,6 @@ let TargetPrefix = "arm" in { // Vector Absolute Differences. def int_arm_neon_vabds : Neon_2Arg_Intrinsic; def int_arm_neon_vabdu : Neon_2Arg_Intrinsic; -def int_arm_neon_vabdf : Neon_2Arg_Float_Intrinsic; def int_arm_neon_vabdls : Neon_2Arg_Long_Intrinsic; def int_arm_neon_vabdlu : Neon_2Arg_Long_Intrinsic; @@ -197,16 +183,16 @@ def int_arm_neon_vabals : Neon_3Arg_Long_Intrinsic; def int_arm_neon_vabalu : Neon_3Arg_Long_Intrinsic; // Vector Pairwise Add. -def int_arm_neon_vpadd : Neon_2Arg_Vector_Intrinsic; +def int_arm_neon_vpadd : Neon_2Arg_Intrinsic; // Vector Pairwise Add Long. // Note: This is different than the other "long" NEON intrinsics because // the result vector has half as many elements as the source vector. // The source and destination vector types must be specified separately. let TargetPrefix = "arm" in { - def int_arm_neon_vpaddls : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], + def int_arm_neon_vpaddls : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; - def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty], + def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; } @@ -214,21 +200,19 @@ let TargetPrefix = "arm" in { // Note: This is similar to vpaddl but the destination vector also appears // as the first argument. let TargetPrefix = "arm" in { - def int_arm_neon_vpadals : Intrinsic<[llvm_anyint_ty], - [LLVMMatchType<0>, llvm_anyint_ty], + def int_arm_neon_vpadals : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty], [IntrNoMem]>; - def int_arm_neon_vpadalu : Intrinsic<[llvm_anyint_ty], - [LLVMMatchType<0>, llvm_anyint_ty], + def int_arm_neon_vpadalu : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty], [IntrNoMem]>; } // Vector Pairwise Maximum and Minimum. def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic; def int_arm_neon_vpmaxu : Neon_2Arg_Intrinsic; -def int_arm_neon_vpmaxf : Neon_2Arg_Float_Intrinsic; def int_arm_neon_vpmins : Neon_2Arg_Intrinsic; def int_arm_neon_vpminu : Neon_2Arg_Intrinsic; -def int_arm_neon_vpminf : Neon_2Arg_Float_Intrinsic; // Vector Shifts: // @@ -283,7 +267,6 @@ def int_arm_neon_vshiftins : Neon_3Arg_Intrinsic; // Vector Absolute Value and Saturating Absolute Value. def int_arm_neon_vabs : Neon_1Arg_Intrinsic; -def int_arm_neon_vabsf : Neon_1Arg_Float_Intrinsic; def int_arm_neon_vqabs : Neon_1Arg_Intrinsic; // Vector Saturating Negate. @@ -298,11 +281,9 @@ def int_arm_neon_vcnt : Neon_1Arg_Intrinsic; // Vector Reciprocal Estimate. def int_arm_neon_vrecpe : Neon_1Arg_Intrinsic; -def int_arm_neon_vrecpef : Neon_1Arg_Float_Intrinsic; // Vector Reciprocal Square Root Estimate. def int_arm_neon_vrsqrte : Neon_1Arg_Intrinsic; -def int_arm_neon_vrsqrtef : Neon_1Arg_Float_Intrinsic; // Vector Conversions Between Floating-point and Fixed-point. def int_arm_neon_vcvtfp2fxs : Neon_CvtFPToFx_Intrinsic; @@ -331,68 +312,41 @@ def int_arm_neon_vtbx3 : Neon_Tbl5Arg_Intrinsic; def int_arm_neon_vtbx4 : Neon_Tbl6Arg_Intrinsic; // Vector Transpose. -def int_arm_neon_vtrni : Neon_2Result_Intrinsic; -def int_arm_neon_vtrnf : Neon_2Result_Float_Intrinsic; +def int_arm_neon_vtrn : Neon_2Result_Intrinsic; // Vector Interleave (vzip). -def int_arm_neon_vzipi : Neon_2Result_Intrinsic; -def int_arm_neon_vzipf : Neon_2Result_Float_Intrinsic; +def int_arm_neon_vzip : Neon_2Result_Intrinsic; // Vector Deinterleave (vuzp). -def int_arm_neon_vuzpi : Neon_2Result_Intrinsic; -def int_arm_neon_vuzpf : Neon_2Result_Float_Intrinsic; +def int_arm_neon_vuzp : Neon_2Result_Intrinsic; let TargetPrefix = "arm" in { // De-interleaving vector loads from N-element structures. - def int_arm_neon_vld1i : Intrinsic<[llvm_anyint_ty], - [llvm_ptr_ty], [IntrReadArgMem]>; - def int_arm_neon_vld1f : Intrinsic<[llvm_anyfloat_ty], - [llvm_ptr_ty], [IntrReadArgMem]>; - def int_arm_neon_vld2i : Intrinsic<[llvm_anyint_ty, LLVMMatchType<0>], - [llvm_ptr_ty], [IntrReadArgMem]>; - def int_arm_neon_vld2f : Intrinsic<[llvm_anyfloat_ty, LLVMMatchType<0>], - [llvm_ptr_ty], [IntrReadArgMem]>; - def int_arm_neon_vld3i : Intrinsic<[llvm_anyint_ty, LLVMMatchType<0>, - LLVMMatchType<0>], - [llvm_ptr_ty], [IntrReadArgMem]>; - def int_arm_neon_vld3f : Intrinsic<[llvm_anyfloat_ty, LLVMMatchType<0>, - LLVMMatchType<0>], - [llvm_ptr_ty], [IntrReadArgMem]>; - def int_arm_neon_vld4i : Intrinsic<[llvm_anyint_ty, LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_ptr_ty], [IntrReadArgMem]>; - def int_arm_neon_vld4f : Intrinsic<[llvm_anyfloat_ty, LLVMMatchType<0>, - LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_ptr_ty], [IntrReadArgMem]>; + def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty], + [llvm_ptr_ty], [IntrReadArgMem]>; + def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], + [llvm_ptr_ty], [IntrReadArgMem]>; + def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>], + [llvm_ptr_ty], [IntrReadArgMem]>; + def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_ptr_ty], [IntrReadArgMem]>; // Interleaving vector stores from N-element structures. - def int_arm_neon_vst1i : Intrinsic<[llvm_void_ty], - [llvm_ptr_ty, llvm_anyint_ty], - [IntrWriteArgMem]>; - def int_arm_neon_vst1f : Intrinsic<[llvm_void_ty], - [llvm_ptr_ty, llvm_anyfloat_ty], + def int_arm_neon_vst1 : Intrinsic<[llvm_void_ty], + [llvm_ptr_ty, llvm_anyvector_ty], + [IntrWriteArgMem]>; + def int_arm_neon_vst2 : Intrinsic<[llvm_void_ty], + [llvm_ptr_ty, llvm_anyvector_ty, + LLVMMatchType<0>], [IntrWriteArgMem]>; + def int_arm_neon_vst3 : Intrinsic<[llvm_void_ty], + [llvm_ptr_ty, llvm_anyvector_ty, + LLVMMatchType<0>, LLVMMatchType<0>], [IntrWriteArgMem]>; - def int_arm_neon_vst2i : Intrinsic<[llvm_void_ty], - [llvm_ptr_ty, llvm_anyint_ty, - LLVMMatchType<0>], [IntrWriteArgMem]>; - def int_arm_neon_vst2f : Intrinsic<[llvm_void_ty], - [llvm_ptr_ty, llvm_anyfloat_ty, - LLVMMatchType<0>], [IntrWriteArgMem]>; - def int_arm_neon_vst3i : Intrinsic<[llvm_void_ty], - [llvm_ptr_ty, llvm_anyint_ty, - LLVMMatchType<0>, LLVMMatchType<0>], - [IntrWriteArgMem]>; - def int_arm_neon_vst3f : Intrinsic<[llvm_void_ty], - [llvm_ptr_ty, llvm_anyfloat_ty, - LLVMMatchType<0>, LLVMMatchType<0>], - [IntrWriteArgMem]>; - def int_arm_neon_vst4i : Intrinsic<[llvm_void_ty], - [llvm_ptr_ty, llvm_anyint_ty, - LLVMMatchType<0>, LLVMMatchType<0>, - LLVMMatchType<0>], [IntrWriteArgMem]>; - def int_arm_neon_vst4f : Intrinsic<[llvm_void_ty], - [llvm_ptr_ty, llvm_anyfloat_ty, - LLVMMatchType<0>, LLVMMatchType<0>, - LLVMMatchType<0>], [IntrWriteArgMem]>; + def int_arm_neon_vst4 : Intrinsic<[llvm_void_ty], + [llvm_ptr_ty, llvm_anyvector_ty, + LLVMMatchType<0>, LLVMMatchType<0>, + LLVMMatchType<0>], [IntrWriteArgMem]>; } diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 1096e8eb01c..a927da2a380 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1466,8 +1466,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { switch (IntNo) { default: break; - case Intrinsic::arm_neon_vtrni: - case Intrinsic::arm_neon_vtrnf: + case Intrinsic::arm_neon_vtrn: switch (VT.getSimpleVT()) { default: return NULL; case EVT::v8i8: Opc = ARM::VTRNd8; break; @@ -1482,8 +1481,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { return CurDAG->getTargetNode(Opc, dl, VT, VT, N->getOperand(1), N->getOperand(2)); - case Intrinsic::arm_neon_vuzpi: - case Intrinsic::arm_neon_vuzpf: + case Intrinsic::arm_neon_vuzp: switch (VT.getSimpleVT()) { default: return NULL; case EVT::v8i8: Opc = ARM::VUZPd8; break; @@ -1498,8 +1496,7 @@ SDNode *ARMDAGToDAGISel::Select(SDValue Op) { return CurDAG->getTargetNode(Opc, dl, VT, VT, N->getOperand(1), N->getOperand(2)); - case Intrinsic::arm_neon_vzipi: - case Intrinsic::arm_neon_vzipf: + case Intrinsic::arm_neon_vzip: switch (VT.getSimpleVT()) { default: return NULL; case EVT::v8i8: Opc = ARM::VZIPd8; break; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 61722d44fae..1a662d9d872 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1360,23 +1360,17 @@ SDValue ARMTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { unsigned IntNo = cast(Op.getOperand(1))->getZExtValue(); switch (IntNo) { - case Intrinsic::arm_neon_vld2i: - case Intrinsic::arm_neon_vld2f: + case Intrinsic::arm_neon_vld2: return LowerNeonVLDIntrinsic(Op, DAG, ARMISD::VLD2D); - case Intrinsic::arm_neon_vld3i: - case Intrinsic::arm_neon_vld3f: + case Intrinsic::arm_neon_vld3: return LowerNeonVLDIntrinsic(Op, DAG, ARMISD::VLD3D); - case Intrinsic::arm_neon_vld4i: - case Intrinsic::arm_neon_vld4f: + case Intrinsic::arm_neon_vld4: return LowerNeonVLDIntrinsic(Op, DAG, ARMISD::VLD4D); - case Intrinsic::arm_neon_vst2i: - case Intrinsic::arm_neon_vst2f: + case Intrinsic::arm_neon_vst2: return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST2D, 2); - case Intrinsic::arm_neon_vst3i: - case Intrinsic::arm_neon_vst3f: + case Intrinsic::arm_neon_vst3: return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST3D, 3); - case Intrinsic::arm_neon_vst4i: - case Intrinsic::arm_neon_vst4f: + case Intrinsic::arm_neon_vst4: return LowerNeonVSTIntrinsic(Op, DAG, ARMISD::VST4D, 4); default: return SDValue(); // Don't custom lower most intrinsics. } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 1ed3a619a0f..53283e84ead 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -173,17 +173,17 @@ class VLD1Q !strconcat(OpcodeStr, "\t${dst:dregpair}, $addr"), [(set QPR:$dst, (Ty (IntOp addrmode6:$addr)))]>; -def VLD1d8 : VLD1D<"vld1.8", v8i8, int_arm_neon_vld1i>; -def VLD1d16 : VLD1D<"vld1.16", v4i16, int_arm_neon_vld1i>; -def VLD1d32 : VLD1D<"vld1.32", v2i32, int_arm_neon_vld1i>; -def VLD1df : VLD1D<"vld1.32", v2f32, int_arm_neon_vld1f>; -def VLD1d64 : VLD1D<"vld1.64", v1i64, int_arm_neon_vld1i>; - -def VLD1q8 : VLD1Q<"vld1.8", v16i8, int_arm_neon_vld1i>; -def VLD1q16 : VLD1Q<"vld1.16", v8i16, int_arm_neon_vld1i>; -def VLD1q32 : VLD1Q<"vld1.32", v4i32, int_arm_neon_vld1i>; -def VLD1qf : VLD1Q<"vld1.32", v4f32, int_arm_neon_vld1f>; -def VLD1q64 : VLD1Q<"vld1.64", v2i64, int_arm_neon_vld1i>; +def VLD1d8 : VLD1D<"vld1.8", v8i8, int_arm_neon_vld1>; +def VLD1d16 : VLD1D<"vld1.16", v4i16, int_arm_neon_vld1>; +def VLD1d32 : VLD1D<"vld1.32", v2i32, int_arm_neon_vld1>; +def VLD1df : VLD1D<"vld1.32", v2f32, int_arm_neon_vld1>; +def VLD1d64 : VLD1D<"vld1.64", v1i64, int_arm_neon_vld1>; + +def VLD1q8 : VLD1Q<"vld1.8", v16i8, int_arm_neon_vld1>; +def VLD1q16 : VLD1Q<"vld1.16", v8i16, int_arm_neon_vld1>; +def VLD1q32 : VLD1Q<"vld1.32", v4i32, int_arm_neon_vld1>; +def VLD1qf : VLD1Q<"vld1.32", v4f32, int_arm_neon_vld1>; +def VLD1q64 : VLD1Q<"vld1.64", v2i64, int_arm_neon_vld1>; // VLD2 : Vector Load (multiple 2-element structures) class VLD2D @@ -228,17 +228,17 @@ class VST1Q !strconcat(OpcodeStr, "\t${src:dregpair}, $addr"), [(IntOp addrmode6:$addr, (Ty QPR:$src))]>; -def VST1d8 : VST1D<"vst1.8", v8i8, int_arm_neon_vst1i>; -def VST1d16 : VST1D<"vst1.16", v4i16, int_arm_neon_vst1i>; -def VST1d32 : VST1D<"vst1.32", v2i32, int_arm_neon_vst1i>; -def VST1df : VST1D<"vst1.32", v2f32, int_arm_neon_vst1f>; -def VST1d64 : VST1D<"vst1.64", v1i64, int_arm_neon_vst1i>; +def VST1d8 : VST1D<"vst1.8", v8i8, int_arm_neon_vst1>; +def VST1d16 : VST1D<"vst1.16", v4i16, int_arm_neon_vst1>; +def VST1d32 : VST1D<"vst1.32", v2i32, int_arm_neon_vst1>; +def VST1df : VST1D<"vst1.32", v2f32, int_arm_neon_vst1>; +def VST1d64 : VST1D<"vst1.64", v1i64, int_arm_neon_vst1>; -def VST1q8 : VST1Q<"vst1.8", v16i8, int_arm_neon_vst1i>; -def VST1q16 : VST1Q<"vst1.16", v8i16, int_arm_neon_vst1i>; -def VST1q32 : VST1Q<"vst1.32", v4i32, int_arm_neon_vst1i>; -def VST1qf : VST1Q<"vst1.32", v4f32, int_arm_neon_vst1f>; -def VST1q64 : VST1Q<"vst1.64", v2i64, int_arm_neon_vst1i>; +def VST1q8 : VST1Q<"vst1.8", v16i8, int_arm_neon_vst1>; +def VST1q16 : VST1Q<"vst1.16", v8i16, int_arm_neon_vst1>; +def VST1q32 : VST1Q<"vst1.32", v4i32, int_arm_neon_vst1>; +def VST1qf : VST1Q<"vst1.32", v4f32, int_arm_neon_vst1>; +def VST1q64 : VST1Q<"vst1.64", v2i64, int_arm_neon_vst1>; // VST2 : Vector Store (multiple 2-element structures) class VST2D @@ -1223,9 +1223,9 @@ def VBSLq : N3V<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst), defm VABDs : N3VInt_QHS<0, 0, 0b0111, 0, "vabd.s", int_arm_neon_vabds, 0>; defm VABDu : N3VInt_QHS<1, 0, 0b0111, 0, "vabd.u", int_arm_neon_vabdu, 0>; def VABDfd : N3VDInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v2f32, v2f32, - int_arm_neon_vabdf, 0>; + int_arm_neon_vabds, 0>; def VABDfq : N3VQInt<1, 0, 0b10, 0b1101, 0, "vabd.f32", v4f32, v4f32, - int_arm_neon_vabdf, 0>; + int_arm_neon_vabds, 0>; // VABDL : Vector Absolute Difference Long (Q = | D - D |) defm VABDLs : N3VLInt_QHS<0,1,0b0111,0, "vabdl.s", int_arm_neon_vabdls, 0>; @@ -1245,17 +1245,17 @@ defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal.u", int_arm_neon_vabalu>; defm VMAXs : N3VInt_QHS<0, 0, 0b0110, 0, "vmax.s", int_arm_neon_vmaxs, 1>; defm VMAXu : N3VInt_QHS<1, 0, 0b0110, 0, "vmax.u", int_arm_neon_vmaxu, 1>; def VMAXfd : N3VDInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v2f32, v2f32, - int_arm_neon_vmaxf, 1>; + int_arm_neon_vmaxs, 1>; def VMAXfq : N3VQInt<0, 0, 0b00, 0b1111, 0, "vmax.f32", v4f32, v4f32, - int_arm_neon_vmaxf, 1>; + int_arm_neon_vmaxs, 1>; // VMIN : Vector Minimum defm VMINs : N3VInt_QHS<0, 0, 0b0110, 1, "vmin.s", int_arm_neon_vmins, 1>; defm VMINu : N3VInt_QHS<1, 0, 0b0110, 1, "vmin.u", int_arm_neon_vminu, 1>; def VMINfd : N3VDInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v2f32, v2f32, - int_arm_neon_vminf, 1>; + int_arm_neon_vmins, 1>; def VMINfq : N3VQInt<0, 0, 0b10, 0b1111, 0, "vmin.f32", v4f32, v4f32, - int_arm_neon_vminf, 1>; + int_arm_neon_vmins, 1>; // Vector Pairwise Operations. @@ -1295,7 +1295,7 @@ def VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, "vpmax.u16", v4i16, v4i16, def VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, "vpmax.u32", v2i32, v2i32, int_arm_neon_vpmaxu, 0>; def VPMAXf : N3VDInt<1, 0, 0b00, 0b1111, 0, "vpmax.f32", v2f32, v2f32, - int_arm_neon_vpmaxf, 0>; + int_arm_neon_vpmaxs, 0>; // VPMIN : Vector Pairwise Minimum def VPMINs8 : N3VDInt<0, 0, 0b00, 0b1010, 1, "vpmin.s8", v8i8, v8i8, @@ -1311,7 +1311,7 @@ def VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, "vpmin.u16", v4i16, v4i16, def VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, "vpmin.u32", v2i32, v2i32, int_arm_neon_vpminu, 0>; def VPMINf : N3VDInt<1, 0, 0b10, 0b1111, 0, "vpmin.f32", v2f32, v2f32, - int_arm_neon_vpminf, 0>; + int_arm_neon_vpmins, 0>; // Vector Reciprocal and Reciprocal Square Root Estimate and Step. @@ -1321,9 +1321,9 @@ def VRECPEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32", def VRECPEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, "vrecpe.u32", v4i32, v4i32, int_arm_neon_vrecpe>; def VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32", - v2f32, v2f32, int_arm_neon_vrecpef>; + v2f32, v2f32, int_arm_neon_vrecpe>; def VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0, "vrecpe.f32", - v4f32, v4f32, int_arm_neon_vrecpef>; + v4f32, v4f32, int_arm_neon_vrecpe>; // VRECPS : Vector Reciprocal Step def VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1, "vrecps.f32", v2f32, v2f32, @@ -1337,9 +1337,9 @@ def VRSQRTEd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32", def VRSQRTEq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0, "vrsqrte.u32", v4i32, v4i32, int_arm_neon_vrsqrte>; def VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32", - v2f32, v2f32, int_arm_neon_vrsqrtef>; + v2f32, v2f32, int_arm_neon_vrsqrte>; def VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, "vrsqrte.f32", - v4f32, v4f32, int_arm_neon_vrsqrtef>; + v4f32, v4f32, int_arm_neon_vrsqrte>; // VRSQRTS : Vector Reciprocal Square Root Step def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1, "vrsqrts.f32", v2f32, v2f32, @@ -1480,9 +1480,9 @@ defm VSRI : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri.", NEONvsri>; defm VABS : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, "vabs.s", int_arm_neon_vabs>; def VABSfd : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32", - v2f32, v2f32, int_arm_neon_vabsf>; + v2f32, v2f32, int_arm_neon_vabs>; def VABSfq : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32", - v4f32, v4f32, int_arm_neon_vabsf>; + v4f32, v4f32, int_arm_neon_vabs>; // VQABS : Vector Saturating Absolute Value defm VQABS : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, "vqabs.s", @@ -2017,7 +2017,7 @@ def : N3VDMulOpsPat; // Vector Absolute used for single-precision FP let neverHasSideEffects = 1 in def VABSfd_sfp : N2VDInts<0b11, 0b11, 0b10, 0b01, 0b01110, 0, "vabs.f32", - v2f32, v2f32, int_arm_neon_vabsf>; + v2f32, v2f32, int_arm_neon_vabs>; def : N2VDIntsPat; // Vector Negate used for single-precision FP diff --git a/test/CodeGen/ARM/vabd.ll b/test/CodeGen/ARM/vabd.ll index c0497f9134e..e7648401548 100644 --- a/test/CodeGen/ARM/vabd.ll +++ b/test/CodeGen/ARM/vabd.ll @@ -59,7 +59,7 @@ define <2 x float> @vabdf32(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK: vabd.f32 %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -122,7 +122,7 @@ define <4 x float> @vabdQf32(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: vabd.f32 %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -134,7 +134,7 @@ declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) nounwind readnon declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <2 x float> @llvm.arm.neon.vabdf.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float>, <2 x float>) nounwind readnone declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone @@ -144,4 +144,4 @@ declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) nounwind read declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vabdf.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vabs.ll b/test/CodeGen/ARM/vabs.ll index a7979eec52a..1195f087ef4 100644 --- a/test/CodeGen/ARM/vabs.ll +++ b/test/CodeGen/ARM/vabs.ll @@ -28,7 +28,7 @@ define <2 x float> @vabsf32(<2 x float>* %A) nounwind { ;CHECK: vabsf32: ;CHECK: vabs.f32 %tmp1 = load <2 x float>* %A - %tmp2 = call <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float> %tmp1) + %tmp2 = call <2 x float> @llvm.arm.neon.vabs.v2f32(<2 x float> %tmp1) ret <2 x float> %tmp2 } @@ -60,17 +60,17 @@ define <4 x float> @vabsQf32(<4 x float>* %A) nounwind { ;CHECK: vabsQf32: ;CHECK: vabs.f32 %tmp1 = load <4 x float>* %A - %tmp2 = call <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float> %tmp1) + %tmp2 = call <4 x float> @llvm.arm.neon.vabs.v4f32(<4 x float> %tmp1) ret <4 x float> %tmp2 } declare <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8>) nounwind readnone declare <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16>) nounwind readnone declare <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32>) nounwind readnone -declare <2 x float> @llvm.arm.neon.vabsf.v2f32(<2 x float>) nounwind readnone +declare <2 x float> @llvm.arm.neon.vabs.v2f32(<2 x float>) nounwind readnone declare <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8>) nounwind readnone declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vabsf.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vabs.v4f32(<4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vld1.ll b/test/CodeGen/ARM/vld1.ll index d5191338c9a..81f1bdec9ee 100644 --- a/test/CodeGen/ARM/vld1.ll +++ b/test/CodeGen/ARM/vld1.ll @@ -3,81 +3,81 @@ define <8 x i8> @vld1i8(i8* %A) nounwind { ;CHECK: vld1i8: ;CHECK: vld1.8 - %tmp1 = call <8 x i8> @llvm.arm.neon.vld1i.v8i8(i8* %A) + %tmp1 = call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %A) ret <8 x i8> %tmp1 } define <4 x i16> @vld1i16(i16* %A) nounwind { ;CHECK: vld1i16: ;CHECK: vld1.16 - %tmp1 = call <4 x i16> @llvm.arm.neon.vld1i.v4i16(i16* %A) + %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i16* %A) ret <4 x i16> %tmp1 } define <2 x i32> @vld1i32(i32* %A) nounwind { ;CHECK: vld1i32: ;CHECK: vld1.32 - %tmp1 = call <2 x i32> @llvm.arm.neon.vld1i.v2i32(i32* %A) + %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i32* %A) ret <2 x i32> %tmp1 } define <2 x float> @vld1f(float* %A) nounwind { ;CHECK: vld1f: ;CHECK: vld1.32 - %tmp1 = call <2 x float> @llvm.arm.neon.vld1f.v2f32(float* %A) + %tmp1 = call <2 x float> @llvm.arm.neon.vld1.v2f32(float* %A) ret <2 x float> %tmp1 } define <1 x i64> @vld1i64(i64* %A) nounwind { ;CHECK: vld1i64: ;CHECK: vld1.64 - %tmp1 = call <1 x i64> @llvm.arm.neon.vld1i.v1i64(i64* %A) + %tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64(i64* %A) ret <1 x i64> %tmp1 } define <16 x i8> @vld1Qi8(i8* %A) nounwind { ;CHECK: vld1Qi8: ;CHECK: vld1.8 - %tmp1 = call <16 x i8> @llvm.arm.neon.vld1i.v16i8(i8* %A) + %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A) ret <16 x i8> %tmp1 } define <8 x i16> @vld1Qi16(i16* %A) nounwind { ;CHECK: vld1Qi16: ;CHECK: vld1.16 - %tmp1 = call <8 x i16> @llvm.arm.neon.vld1i.v8i16(i16* %A) + %tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i16* %A) ret <8 x i16> %tmp1 } define <4 x i32> @vld1Qi32(i32* %A) nounwind { ;CHECK: vld1Qi32: ;CHECK: vld1.32 - %tmp1 = call <4 x i32> @llvm.arm.neon.vld1i.v4i32(i32* %A) + %tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32(i32* %A) ret <4 x i32> %tmp1 } define <4 x float> @vld1Qf(float* %A) nounwind { ;CHECK: vld1Qf: ;CHECK: vld1.32 - %tmp1 = call <4 x float> @llvm.arm.neon.vld1f.v4f32(float* %A) + %tmp1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(float* %A) ret <4 x float> %tmp1 } define <2 x i64> @vld1Qi64(i64* %A) nounwind { ;CHECK: vld1Qi64: ;CHECK: vld1.64 - %tmp1 = call <2 x i64> @llvm.arm.neon.vld1i.v2i64(i64* %A) + %tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i64* %A) ret <2 x i64> %tmp1 } -declare <8 x i8> @llvm.arm.neon.vld1i.v8i8(i8*) nounwind readonly -declare <4 x i16> @llvm.arm.neon.vld1i.v4i16(i8*) nounwind readonly -declare <2 x i32> @llvm.arm.neon.vld1i.v2i32(i8*) nounwind readonly -declare <2 x float> @llvm.arm.neon.vld1f.v2f32(i8*) nounwind readonly -declare <1 x i64> @llvm.arm.neon.vld1i.v1i64(i8*) nounwind readonly +declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*) nounwind readonly +declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*) nounwind readonly +declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*) nounwind readonly +declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*) nounwind readonly +declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*) nounwind readonly -declare <16 x i8> @llvm.arm.neon.vld1i.v16i8(i8*) nounwind readonly -declare <8 x i16> @llvm.arm.neon.vld1i.v8i16(i8*) nounwind readonly -declare <4 x i32> @llvm.arm.neon.vld1i.v4i32(i8*) nounwind readonly -declare <4 x float> @llvm.arm.neon.vld1f.v4f32(i8*) nounwind readonly -declare <2 x i64> @llvm.arm.neon.vld1i.v2i64(i8*) nounwind readonly +declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*) nounwind readonly +declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*) nounwind readonly +declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*) nounwind readonly +declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*) nounwind readonly +declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*) nounwind readonly diff --git a/test/CodeGen/ARM/vld2.ll b/test/CodeGen/ARM/vld2.ll index 2c16ac19d8d..168b62b9ab4 100644 --- a/test/CodeGen/ARM/vld2.ll +++ b/test/CodeGen/ARM/vld2.ll @@ -8,7 +8,7 @@ define <8 x i8> @vld2i8(i8* %A) nounwind { ;CHECK: vld2i8: ;CHECK: vld2.8 - %tmp1 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vld2i.v8i8(i8* %A) + %tmp1 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vld2.v8i8(i8* %A) %tmp2 = extractvalue %struct.__builtin_neon_v8qi2 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v8qi2 %tmp1, 1 %tmp4 = add <8 x i8> %tmp2, %tmp3 @@ -18,7 +18,7 @@ define <8 x i8> @vld2i8(i8* %A) nounwind { define <4 x i16> @vld2i16(i16* %A) nounwind { ;CHECK: vld2i16: ;CHECK: vld2.16 - %tmp1 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vld2i.v4i16(i16* %A) + %tmp1 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vld2.v4i16(i16* %A) %tmp2 = extractvalue %struct.__builtin_neon_v4hi2 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v4hi2 %tmp1, 1 %tmp4 = add <4 x i16> %tmp2, %tmp3 @@ -28,7 +28,7 @@ define <4 x i16> @vld2i16(i16* %A) nounwind { define <2 x i32> @vld2i32(i32* %A) nounwind { ;CHECK: vld2i32: ;CHECK: vld2.32 - %tmp1 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vld2i.v2i32(i32* %A) + %tmp1 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vld2.v2i32(i32* %A) %tmp2 = extractvalue %struct.__builtin_neon_v2si2 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v2si2 %tmp1, 1 %tmp4 = add <2 x i32> %tmp2, %tmp3 @@ -38,14 +38,14 @@ define <2 x i32> @vld2i32(i32* %A) nounwind { define <2 x float> @vld2f(float* %A) nounwind { ;CHECK: vld2f: ;CHECK: vld2.32 - %tmp1 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vld2f.v2f32(float* %A) + %tmp1 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vld2.v2f32(float* %A) %tmp2 = extractvalue %struct.__builtin_neon_v2sf2 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v2sf2 %tmp1, 1 %tmp4 = add <2 x float> %tmp2, %tmp3 ret <2 x float> %tmp4 } -declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vld2i.v8i8(i8*) nounwind readonly -declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vld2i.v4i16(i8*) nounwind readonly -declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vld2i.v2i32(i8*) nounwind readonly -declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vld2f.v2f32(i8*) nounwind readonly +declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vld2.v8i8(i8*) nounwind readonly +declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vld2.v4i16(i8*) nounwind readonly +declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vld2.v2i32(i8*) nounwind readonly +declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vld2.v2f32(i8*) nounwind readonly diff --git a/test/CodeGen/ARM/vld3.ll b/test/CodeGen/ARM/vld3.ll index 49665f6de33..5e528c04560 100644 --- a/test/CodeGen/ARM/vld3.ll +++ b/test/CodeGen/ARM/vld3.ll @@ -8,7 +8,7 @@ define <8 x i8> @vld3i8(i8* %A) nounwind { ;CHECK: vld3i8: ;CHECK: vld3.8 - %tmp1 = call %struct.__builtin_neon_v8qi3 @llvm.arm.neon.vld3i.v8i8(i8* %A) + %tmp1 = call %struct.__builtin_neon_v8qi3 @llvm.arm.neon.vld3.v8i8(i8* %A) %tmp2 = extractvalue %struct.__builtin_neon_v8qi3 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v8qi3 %tmp1, 2 %tmp4 = add <8 x i8> %tmp2, %tmp3 @@ -18,7 +18,7 @@ define <8 x i8> @vld3i8(i8* %A) nounwind { define <4 x i16> @vld3i16(i16* %A) nounwind { ;CHECK: vld3i16: ;CHECK: vld3.16 - %tmp1 = call %struct.__builtin_neon_v4hi3 @llvm.arm.neon.vld3i.v4i16(i16* %A) + %tmp1 = call %struct.__builtin_neon_v4hi3 @llvm.arm.neon.vld3.v4i16(i16* %A) %tmp2 = extractvalue %struct.__builtin_neon_v4hi3 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v4hi3 %tmp1, 2 %tmp4 = add <4 x i16> %tmp2, %tmp3 @@ -28,7 +28,7 @@ define <4 x i16> @vld3i16(i16* %A) nounwind { define <2 x i32> @vld3i32(i32* %A) nounwind { ;CHECK: vld3i32: ;CHECK: vld3.32 - %tmp1 = call %struct.__builtin_neon_v2si3 @llvm.arm.neon.vld3i.v2i32(i32* %A) + %tmp1 = call %struct.__builtin_neon_v2si3 @llvm.arm.neon.vld3.v2i32(i32* %A) %tmp2 = extractvalue %struct.__builtin_neon_v2si3 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v2si3 %tmp1, 2 %tmp4 = add <2 x i32> %tmp2, %tmp3 @@ -38,14 +38,14 @@ define <2 x i32> @vld3i32(i32* %A) nounwind { define <2 x float> @vld3f(float* %A) nounwind { ;CHECK: vld3f: ;CHECK: vld3.32 - %tmp1 = call %struct.__builtin_neon_v2sf3 @llvm.arm.neon.vld3f.v2f32(float* %A) + %tmp1 = call %struct.__builtin_neon_v2sf3 @llvm.arm.neon.vld3.v2f32(float* %A) %tmp2 = extractvalue %struct.__builtin_neon_v2sf3 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v2sf3 %tmp1, 2 %tmp4 = add <2 x float> %tmp2, %tmp3 ret <2 x float> %tmp4 } -declare %struct.__builtin_neon_v8qi3 @llvm.arm.neon.vld3i.v8i8(i8*) nounwind readonly -declare %struct.__builtin_neon_v4hi3 @llvm.arm.neon.vld3i.v4i16(i8*) nounwind readonly -declare %struct.__builtin_neon_v2si3 @llvm.arm.neon.vld3i.v2i32(i8*) nounwind readonly -declare %struct.__builtin_neon_v2sf3 @llvm.arm.neon.vld3f.v2f32(i8*) nounwind readonly +declare %struct.__builtin_neon_v8qi3 @llvm.arm.neon.vld3.v8i8(i8*) nounwind readonly +declare %struct.__builtin_neon_v4hi3 @llvm.arm.neon.vld3.v4i16(i8*) nounwind readonly +declare %struct.__builtin_neon_v2si3 @llvm.arm.neon.vld3.v2i32(i8*) nounwind readonly +declare %struct.__builtin_neon_v2sf3 @llvm.arm.neon.vld3.v2f32(i8*) nounwind readonly diff --git a/test/CodeGen/ARM/vld4.ll b/test/CodeGen/ARM/vld4.ll index a0f41cfa67c..48125be5142 100644 --- a/test/CodeGen/ARM/vld4.ll +++ b/test/CodeGen/ARM/vld4.ll @@ -8,7 +8,7 @@ define <8 x i8> @vld4i8(i8* %A) nounwind { ;CHECK: vld4i8: ;CHECK: vld4.8 - %tmp1 = call %struct.__builtin_neon_v8qi4 @llvm.arm.neon.vld4i.v8i8(i8* %A) + %tmp1 = call %struct.__builtin_neon_v8qi4 @llvm.arm.neon.vld4.v8i8(i8* %A) %tmp2 = extractvalue %struct.__builtin_neon_v8qi4 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v8qi4 %tmp1, 2 %tmp4 = add <8 x i8> %tmp2, %tmp3 @@ -18,7 +18,7 @@ define <8 x i8> @vld4i8(i8* %A) nounwind { define <4 x i16> @vld4i16(i16* %A) nounwind { ;CHECK: vld4i16: ;CHECK: vld4.16 - %tmp1 = call %struct.__builtin_neon_v4hi4 @llvm.arm.neon.vld4i.v4i16(i16* %A) + %tmp1 = call %struct.__builtin_neon_v4hi4 @llvm.arm.neon.vld4.v4i16(i16* %A) %tmp2 = extractvalue %struct.__builtin_neon_v4hi4 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v4hi4 %tmp1, 2 %tmp4 = add <4 x i16> %tmp2, %tmp3 @@ -28,7 +28,7 @@ define <4 x i16> @vld4i16(i16* %A) nounwind { define <2 x i32> @vld4i32(i32* %A) nounwind { ;CHECK: vld4i32: ;CHECK: vld4.32 - %tmp1 = call %struct.__builtin_neon_v2si4 @llvm.arm.neon.vld4i.v2i32(i32* %A) + %tmp1 = call %struct.__builtin_neon_v2si4 @llvm.arm.neon.vld4.v2i32(i32* %A) %tmp2 = extractvalue %struct.__builtin_neon_v2si4 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v2si4 %tmp1, 2 %tmp4 = add <2 x i32> %tmp2, %tmp3 @@ -38,14 +38,14 @@ define <2 x i32> @vld4i32(i32* %A) nounwind { define <2 x float> @vld4f(float* %A) nounwind { ;CHECK: vld4f: ;CHECK: vld4.32 - %tmp1 = call %struct.__builtin_neon_v2sf4 @llvm.arm.neon.vld4f.v2f32(float* %A) + %tmp1 = call %struct.__builtin_neon_v2sf4 @llvm.arm.neon.vld4.v2f32(float* %A) %tmp2 = extractvalue %struct.__builtin_neon_v2sf4 %tmp1, 0 %tmp3 = extractvalue %struct.__builtin_neon_v2sf4 %tmp1, 2 %tmp4 = add <2 x float> %tmp2, %tmp3 ret <2 x float> %tmp4 } -declare %struct.__builtin_neon_v8qi4 @llvm.arm.neon.vld4i.v8i8(i8*) nounwind readonly -declare %struct.__builtin_neon_v4hi4 @llvm.arm.neon.vld4i.v4i16(i8*) nounwind readonly -declare %struct.__builtin_neon_v2si4 @llvm.arm.neon.vld4i.v2i32(i8*) nounwind readonly -declare %struct.__builtin_neon_v2sf4 @llvm.arm.neon.vld4f.v2f32(i8*) nounwind readonly +declare %struct.__builtin_neon_v8qi4 @llvm.arm.neon.vld4.v8i8(i8*) nounwind readonly +declare %struct.__builtin_neon_v4hi4 @llvm.arm.neon.vld4.v4i16(i8*) nounwind readonly +declare %struct.__builtin_neon_v2si4 @llvm.arm.neon.vld4.v2i32(i8*) nounwind readonly +declare %struct.__builtin_neon_v2sf4 @llvm.arm.neon.vld4.v2f32(i8*) nounwind readonly diff --git a/test/CodeGen/ARM/vmax.ll b/test/CodeGen/ARM/vmax.ll index 60322f85d39..65f607671c7 100644 --- a/test/CodeGen/ARM/vmax.ll +++ b/test/CodeGen/ARM/vmax.ll @@ -52,7 +52,7 @@ define <2 x i32> @vmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <2 x float> @vmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind { %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -101,7 +101,7 @@ define <4 x i32> @vmaxQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { define <4 x float> @vmaxQf32(<4 x float>* %A, <4 x float>* %B) nounwind { %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -113,7 +113,7 @@ declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnon declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <2 x float> @llvm.arm.neon.vmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone @@ -123,4 +123,4 @@ declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>) nounwind read declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vmaxf.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vmin.ll b/test/CodeGen/ARM/vmin.ll index a6936937c7b..08a3f090991 100644 --- a/test/CodeGen/ARM/vmin.ll +++ b/test/CodeGen/ARM/vmin.ll @@ -52,7 +52,7 @@ define <2 x i32> @vminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind { %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -101,7 +101,7 @@ define <4 x i32> @vminQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind { %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) ret <4 x float> %tmp3 } @@ -113,7 +113,7 @@ declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnon declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <2 x float> @llvm.arm.neon.vminf.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone @@ -123,4 +123,4 @@ declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) nounwind read declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vminf.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vpmax.ll b/test/CodeGen/ARM/vpmax.ll index 9878ca8c7ba..90ae70ff94b 100644 --- a/test/CodeGen/ARM/vpmax.ll +++ b/test/CodeGen/ARM/vpmax.ll @@ -52,7 +52,7 @@ define <2 x i32> @vpmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <2 x float> @vpmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind { %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -64,4 +64,4 @@ declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readno declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <2 x float> @llvm.arm.neon.vpmaxf.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vpmin.ll b/test/CodeGen/ARM/vpmin.ll index 7b5348baa54..0f982f4610a 100644 --- a/test/CodeGen/ARM/vpmin.ll +++ b/test/CodeGen/ARM/vpmin.ll @@ -52,7 +52,7 @@ define <2 x i32> @vpminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <2 x float> @vpminf32(<2 x float>* %A, <2 x float>* %B) nounwind { %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) ret <2 x float> %tmp3 } @@ -64,4 +64,4 @@ declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>) nounwind readno declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <2 x float> @llvm.arm.neon.vpminf.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vrecpe.ll b/test/CodeGen/ARM/vrecpe.ll index 79cb595bc84..622725bce3b 100644 --- a/test/CodeGen/ARM/vrecpe.ll +++ b/test/CodeGen/ARM/vrecpe.ll @@ -16,18 +16,18 @@ define <4 x i32> @vrecpeQi32(<4 x i32>* %A) nounwind { define <2 x float> @vrecpef32(<2 x float>* %A) nounwind { %tmp1 = load <2 x float>* %A - %tmp2 = call <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float> %tmp1) + %tmp2 = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %tmp1) ret <2 x float> %tmp2 } define <4 x float> @vrecpeQf32(<4 x float>* %A) nounwind { %tmp1 = load <4 x float>* %A - %tmp2 = call <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float> %tmp1) + %tmp2 = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %tmp1) ret <4 x float> %tmp2 } declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) nounwind readnone -declare <2 x float> @llvm.arm.neon.vrecpef.v2f32(<2 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vrecpef.v4f32(<4 x float>) nounwind readnone +declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vrsqrte.ll b/test/CodeGen/ARM/vrsqrte.ll index 10529f61b56..4f119775248 100644 --- a/test/CodeGen/ARM/vrsqrte.ll +++ b/test/CodeGen/ARM/vrsqrte.ll @@ -16,18 +16,18 @@ define <4 x i32> @vrsqrteQi32(<4 x i32>* %A) nounwind { define <2 x float> @vrsqrtef32(<2 x float>* %A) nounwind { %tmp1 = load <2 x float>* %A - %tmp2 = call <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float> %tmp1) + %tmp2 = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %tmp1) ret <2 x float> %tmp2 } define <4 x float> @vrsqrteQf32(<4 x float>* %A) nounwind { %tmp1 = load <4 x float>* %A - %tmp2 = call <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float> %tmp1) + %tmp2 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %tmp1) ret <4 x float> %tmp2 } declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) nounwind readnone -declare <2 x float> @llvm.arm.neon.vrsqrtef.v2f32(<2 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vrsqrtef.v4f32(<4 x float>) nounwind readnone +declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vst1.ll b/test/CodeGen/ARM/vst1.ll index d84f75882f3..8fbae12a032 100644 --- a/test/CodeGen/ARM/vst1.ll +++ b/test/CodeGen/ARM/vst1.ll @@ -4,7 +4,7 @@ define void @vst1i8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vst1i8: ;CHECK: vst1.8 %tmp1 = load <8 x i8>* %B - call void @llvm.arm.neon.vst1i.v8i8(i8* %A, <8 x i8> %tmp1) + call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1) ret void } @@ -12,7 +12,7 @@ define void @vst1i16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst1i16: ;CHECK: vst1.16 %tmp1 = load <4 x i16>* %B - call void @llvm.arm.neon.vst1i.v4i16(i16* %A, <4 x i16> %tmp1) + call void @llvm.arm.neon.vst1.v4i16(i16* %A, <4 x i16> %tmp1) ret void } @@ -20,7 +20,7 @@ define void @vst1i32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst1i32: ;CHECK: vst1.32 %tmp1 = load <2 x i32>* %B - call void @llvm.arm.neon.vst1i.v2i32(i32* %A, <2 x i32> %tmp1) + call void @llvm.arm.neon.vst1.v2i32(i32* %A, <2 x i32> %tmp1) ret void } @@ -28,7 +28,7 @@ define void @vst1f(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst1f: ;CHECK: vst1.32 %tmp1 = load <2 x float>* %B - call void @llvm.arm.neon.vst1f.v2f32(float* %A, <2 x float> %tmp1) + call void @llvm.arm.neon.vst1.v2f32(float* %A, <2 x float> %tmp1) ret void } @@ -36,7 +36,7 @@ define void @vst1i64(i64* %A, <1 x i64>* %B) nounwind { ;CHECK: vst1i64: ;CHECK: vst1.64 %tmp1 = load <1 x i64>* %B - call void @llvm.arm.neon.vst1i.v1i64(i64* %A, <1 x i64> %tmp1) + call void @llvm.arm.neon.vst1.v1i64(i64* %A, <1 x i64> %tmp1) ret void } @@ -44,7 +44,7 @@ define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind { ;CHECK: vst1Qi8: ;CHECK: vst1.8 %tmp1 = load <16 x i8>* %B - call void @llvm.arm.neon.vst1i.v16i8(i8* %A, <16 x i8> %tmp1) + call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1) ret void } @@ -52,7 +52,7 @@ define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst1Qi16: ;CHECK: vst1.16 %tmp1 = load <8 x i16>* %B - call void @llvm.arm.neon.vst1i.v8i16(i16* %A, <8 x i16> %tmp1) + call void @llvm.arm.neon.vst1.v8i16(i16* %A, <8 x i16> %tmp1) ret void } @@ -60,7 +60,7 @@ define void @vst1Qi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vst1Qi32: ;CHECK: vst1.32 %tmp1 = load <4 x i32>* %B - call void @llvm.arm.neon.vst1i.v4i32(i32* %A, <4 x i32> %tmp1) + call void @llvm.arm.neon.vst1.v4i32(i32* %A, <4 x i32> %tmp1) ret void } @@ -68,7 +68,7 @@ define void @vst1Qf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vst1Qf: ;CHECK: vst1.32 %tmp1 = load <4 x float>* %B - call void @llvm.arm.neon.vst1f.v4f32(float* %A, <4 x float> %tmp1) + call void @llvm.arm.neon.vst1.v4f32(float* %A, <4 x float> %tmp1) ret void } @@ -76,18 +76,18 @@ define void @vst1Qi64(i64* %A, <2 x i64>* %B) nounwind { ;CHECK: vst1Qi64: ;CHECK: vst1.64 %tmp1 = load <2 x i64>* %B - call void @llvm.arm.neon.vst1i.v2i64(i64* %A, <2 x i64> %tmp1) + call void @llvm.arm.neon.vst1.v2i64(i64* %A, <2 x i64> %tmp1) ret void } -declare void @llvm.arm.neon.vst1i.v8i8(i8*, <8 x i8>) nounwind -declare void @llvm.arm.neon.vst1i.v4i16(i8*, <4 x i16>) nounwind -declare void @llvm.arm.neon.vst1i.v2i32(i8*, <2 x i32>) nounwind -declare void @llvm.arm.neon.vst1f.v2f32(i8*, <2 x float>) nounwind -declare void @llvm.arm.neon.vst1i.v1i64(i8*, <1 x i64>) nounwind +declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>) nounwind +declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>) nounwind +declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>) nounwind +declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>) nounwind +declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>) nounwind -declare void @llvm.arm.neon.vst1i.v16i8(i8*, <16 x i8>) nounwind -declare void @llvm.arm.neon.vst1i.v8i16(i8*, <8 x i16>) nounwind -declare void @llvm.arm.neon.vst1i.v4i32(i8*, <4 x i32>) nounwind -declare void @llvm.arm.neon.vst1f.v4f32(i8*, <4 x float>) nounwind -declare void @llvm.arm.neon.vst1i.v2i64(i8*, <2 x i64>) nounwind +declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>) nounwind +declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>) nounwind +declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>) nounwind +declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>) nounwind +declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>) nounwind diff --git a/test/CodeGen/ARM/vst2.ll b/test/CodeGen/ARM/vst2.ll index f8f34f4aae3..3e2d028e726 100644 --- a/test/CodeGen/ARM/vst2.ll +++ b/test/CodeGen/ARM/vst2.ll @@ -4,7 +4,7 @@ define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vst2i8: ;CHECK: vst2.8 %tmp1 = load <8 x i8>* %B - call void @llvm.arm.neon.vst2i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1) + call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1) ret void } @@ -12,7 +12,7 @@ define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst2i16: ;CHECK: vst2.16 %tmp1 = load <4 x i16>* %B - call void @llvm.arm.neon.vst2i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1) + call void @llvm.arm.neon.vst2.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1) ret void } @@ -20,7 +20,7 @@ define void @vst2i32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst2i32: ;CHECK: vst2.32 %tmp1 = load <2 x i32>* %B - call void @llvm.arm.neon.vst2i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1) + call void @llvm.arm.neon.vst2.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1) ret void } @@ -28,11 +28,11 @@ define void @vst2f(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst2f: ;CHECK: vst2.32 %tmp1 = load <2 x float>* %B - call void @llvm.arm.neon.vst2f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1) + call void @llvm.arm.neon.vst2.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1) ret void } -declare void @llvm.arm.neon.vst2i.v8i8(i8*, <8 x i8>, <8 x i8>) nounwind -declare void @llvm.arm.neon.vst2i.v4i16(i8*, <4 x i16>, <4 x i16>) nounwind -declare void @llvm.arm.neon.vst2i.v2i32(i8*, <2 x i32>, <2 x i32>) nounwind -declare void @llvm.arm.neon.vst2f.v2f32(i8*, <2 x float>, <2 x float>) nounwind +declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>) nounwind +declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>) nounwind +declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>) nounwind +declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>) nounwind diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll index c1a6ce86b4c..0a47efae202 100644 --- a/test/CodeGen/ARM/vst3.ll +++ b/test/CodeGen/ARM/vst3.ll @@ -4,7 +4,7 @@ define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vst3i8: ;CHECK: vst3.8 %tmp1 = load <8 x i8>* %B - call void @llvm.arm.neon.vst3i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1) + call void @llvm.arm.neon.vst3.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1) ret void } @@ -12,7 +12,7 @@ define void @vst3i16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst3i16: ;CHECK: vst3.16 %tmp1 = load <4 x i16>* %B - call void @llvm.arm.neon.vst3i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1) + call void @llvm.arm.neon.vst3.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1) ret void } @@ -20,7 +20,7 @@ define void @vst3i32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst3i32: ;CHECK: vst3.32 %tmp1 = load <2 x i32>* %B - call void @llvm.arm.neon.vst3i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1) + call void @llvm.arm.neon.vst3.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1) ret void } @@ -28,11 +28,11 @@ define void @vst3f(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst3f: ;CHECK: vst3.32 %tmp1 = load <2 x float>* %B - call void @llvm.arm.neon.vst3f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1) + call void @llvm.arm.neon.vst3.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1) ret void } -declare void @llvm.arm.neon.vst3i.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind -declare void @llvm.arm.neon.vst3i.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>) nounwind -declare void @llvm.arm.neon.vst3i.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>) nounwind -declare void @llvm.arm.neon.vst3f.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>) nounwind +declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>) nounwind +declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>) nounwind +declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>) nounwind +declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>) nounwind diff --git a/test/CodeGen/ARM/vst4.ll b/test/CodeGen/ARM/vst4.ll index 1d6f109a728..fa745ebc133 100644 --- a/test/CodeGen/ARM/vst4.ll +++ b/test/CodeGen/ARM/vst4.ll @@ -4,7 +4,7 @@ define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind { ;CHECK: vst4i8: ;CHECK: vst4.8 %tmp1 = load <8 x i8>* %B - call void @llvm.arm.neon.vst4i.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1) + call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1) ret void } @@ -12,7 +12,7 @@ define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst4i16: ;CHECK: vst4.16 %tmp1 = load <4 x i16>* %B - call void @llvm.arm.neon.vst4i.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1) + call void @llvm.arm.neon.vst4.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1) ret void } @@ -20,7 +20,7 @@ define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst4i32: ;CHECK: vst4.32 %tmp1 = load <2 x i32>* %B - call void @llvm.arm.neon.vst4i.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1) + call void @llvm.arm.neon.vst4.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1) ret void } @@ -28,11 +28,11 @@ define void @vst4f(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst4f: ;CHECK: vst4.32 %tmp1 = load <2 x float>* %B - call void @llvm.arm.neon.vst4f.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1) + call void @llvm.arm.neon.vst4.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1) ret void } -declare void @llvm.arm.neon.vst4i.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind -declare void @llvm.arm.neon.vst4i.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>) nounwind -declare void @llvm.arm.neon.vst4i.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>) nounwind -declare void @llvm.arm.neon.vst4f.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>) nounwind +declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind +declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>) nounwind +declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>) nounwind +declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>) nounwind diff --git a/test/CodeGen/ARM/vtrn.ll b/test/CodeGen/ARM/vtrn.ll index 205052cdd16..36a05617055 100644 --- a/test/CodeGen/ARM/vtrn.ll +++ b/test/CodeGen/ARM/vtrn.ll @@ -15,7 +15,7 @@ define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vtrn.8 %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vtrni.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vtrn.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 1 %tmp6 = add <8 x i8> %tmp4, %tmp5 @@ -27,7 +27,7 @@ define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vtrn.16 %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vtrni.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vtrn.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 1 %tmp6 = add <4 x i16> %tmp4, %tmp5 @@ -39,7 +39,7 @@ define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vtrn.32 %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vtrni.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vtrn.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 1 %tmp6 = add <2 x i32> %tmp4, %tmp5 @@ -51,7 +51,7 @@ define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK: vtrn.32 %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vtrnf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vtrn.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 1 %tmp6 = add <2 x float> %tmp4, %tmp5 @@ -63,7 +63,7 @@ define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: vtrn.8 %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vtrni.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vtrn.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 1 %tmp6 = add <16 x i8> %tmp4, %tmp5 @@ -75,7 +75,7 @@ define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: vtrn.16 %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vtrni.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vtrn.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 1 %tmp6 = add <8 x i16> %tmp4, %tmp5 @@ -87,7 +87,7 @@ define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: vtrn.32 %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vtrni.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vtrn.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 1 %tmp6 = add <4 x i32> %tmp4, %tmp5 @@ -99,19 +99,19 @@ define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: vtrn.32 %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vtrnf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vtrn.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 1 %tmp6 = add <4 x float> %tmp4, %tmp5 ret <4 x float> %tmp6 } -declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vtrni.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vtrni.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vtrni.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vtrnf.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vtrn.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vtrn.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vtrn.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vtrn.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vtrni.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vtrni.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vtrni.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vtrnf.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vtrn.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vtrn.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vtrn.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vtrn.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vuzp.ll b/test/CodeGen/ARM/vuzp.ll index 508ae147066..883e0722abc 100644 --- a/test/CodeGen/ARM/vuzp.ll +++ b/test/CodeGen/ARM/vuzp.ll @@ -15,7 +15,7 @@ define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vuzp.8 %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vuzpi.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vuzp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 1 %tmp6 = add <8 x i8> %tmp4, %tmp5 @@ -27,7 +27,7 @@ define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vuzp.16 %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vuzpi.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vuzp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 1 %tmp6 = add <4 x i16> %tmp4, %tmp5 @@ -39,7 +39,7 @@ define <2 x i32> @vuzpi32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vuzp.32 %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vuzpi.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vuzp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 1 %tmp6 = add <2 x i32> %tmp4, %tmp5 @@ -51,7 +51,7 @@ define <2 x float> @vuzpf(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK: vuzp.32 %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vuzpf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vuzp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 1 %tmp6 = add <2 x float> %tmp4, %tmp5 @@ -63,7 +63,7 @@ define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: vuzp.8 %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vuzpi.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vuzp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 1 %tmp6 = add <16 x i8> %tmp4, %tmp5 @@ -75,7 +75,7 @@ define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: vuzp.16 %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vuzpi.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vuzp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 1 %tmp6 = add <8 x i16> %tmp4, %tmp5 @@ -87,7 +87,7 @@ define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: vuzp.32 %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vuzpi.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vuzp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 1 %tmp6 = add <4 x i32> %tmp4, %tmp5 @@ -99,19 +99,19 @@ define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: vuzp.32 %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vuzpf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vuzp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 1 %tmp6 = add <4 x float> %tmp4, %tmp5 ret <4 x float> %tmp6 } -declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vuzpi.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vuzpi.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vuzpi.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vuzpf.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vuzp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vuzp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vuzp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vuzp.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vuzpi.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vuzpi.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vuzpi.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vuzpf.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vuzp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vuzp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vuzp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vuzp.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vzip.ll b/test/CodeGen/ARM/vzip.ll index ede5ab6209f..0485b30954b 100644 --- a/test/CodeGen/ARM/vzip.ll +++ b/test/CodeGen/ARM/vzip.ll @@ -15,7 +15,7 @@ define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vzip.8 %tmp1 = load <8 x i8>* %A %tmp2 = load <8 x i8>* %B - %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vzipi.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vzip.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v8qi2 %tmp3, 1 %tmp6 = add <8 x i8> %tmp4, %tmp5 @@ -27,7 +27,7 @@ define <4 x i16> @vzipi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vzip.16 %tmp1 = load <4 x i16>* %A %tmp2 = load <4 x i16>* %B - %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vzipi.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vzip.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v4hi2 %tmp3, 1 %tmp6 = add <4 x i16> %tmp4, %tmp5 @@ -39,7 +39,7 @@ define <2 x i32> @vzipi32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vzip.32 %tmp1 = load <2 x i32>* %A %tmp2 = load <2 x i32>* %B - %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vzipi.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call %struct.__builtin_neon_v2si2 @llvm.arm.neon.vzip.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v2si2 %tmp3, 1 %tmp6 = add <2 x i32> %tmp4, %tmp5 @@ -51,7 +51,7 @@ define <2 x float> @vzipf(<2 x float>* %A, <2 x float>* %B) nounwind { ;CHECK: vzip.32 %tmp1 = load <2 x float>* %A %tmp2 = load <2 x float>* %B - %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vzipf.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + %tmp3 = call %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vzip.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v2sf2 %tmp3, 1 %tmp6 = add <2 x float> %tmp4, %tmp5 @@ -63,7 +63,7 @@ define <16 x i8> @vzipQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: vzip.8 %tmp1 = load <16 x i8>* %A %tmp2 = load <16 x i8>* %B - %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vzipi.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vzip.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v16qi2 %tmp3, 1 %tmp6 = add <16 x i8> %tmp4, %tmp5 @@ -75,7 +75,7 @@ define <8 x i16> @vzipQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: vzip.16 %tmp1 = load <8 x i16>* %A %tmp2 = load <8 x i16>* %B - %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vzipi.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vzip.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v8hi2 %tmp3, 1 %tmp6 = add <8 x i16> %tmp4, %tmp5 @@ -87,7 +87,7 @@ define <4 x i32> @vzipQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: vzip.32 %tmp1 = load <4 x i32>* %A %tmp2 = load <4 x i32>* %B - %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vzipi.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call %struct.__builtin_neon_v4si2 @llvm.arm.neon.vzip.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v4si2 %tmp3, 1 %tmp6 = add <4 x i32> %tmp4, %tmp5 @@ -99,19 +99,19 @@ define <4 x float> @vzipQf(<4 x float>* %A, <4 x float>* %B) nounwind { ;CHECK: vzip.32 %tmp1 = load <4 x float>* %A %tmp2 = load <4 x float>* %B - %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vzipf.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + %tmp3 = call %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vzip.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) %tmp4 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 0 %tmp5 = extractvalue %struct.__builtin_neon_v4sf2 %tmp3, 1 %tmp6 = add <4 x float> %tmp4, %tmp5 ret <4 x float> %tmp6 } -declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vzipi.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vzipi.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vzipi.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vzipf.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare %struct.__builtin_neon_v8qi2 @llvm.arm.neon.vzip.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare %struct.__builtin_neon_v4hi2 @llvm.arm.neon.vzip.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare %struct.__builtin_neon_v2si2 @llvm.arm.neon.vzip.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare %struct.__builtin_neon_v2sf2 @llvm.arm.neon.vzip.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vzipi.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vzipi.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vzipi.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vzipf.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare %struct.__builtin_neon_v16qi2 @llvm.arm.neon.vzip.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare %struct.__builtin_neon_v8hi2 @llvm.arm.neon.vzip.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare %struct.__builtin_neon_v4si2 @llvm.arm.neon.vzip.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare %struct.__builtin_neon_v4sf2 @llvm.arm.neon.vzip.v4f32(<4 x float>, <4 x float>) nounwind readnone -- 2.11.0