From: Krzysztof Parzyszek Date: Thu, 7 Dec 2017 17:37:28 +0000 (+0000) Subject: [Hexagon] Generate HVX code for basic arithmetic operations X-Git-Tag: android-x86-7.1-r4~7605 X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=b58d30229d720616a1bd9c611204d9161fa05e4b;p=android-x86%2Fexternal-llvm.git [Hexagon] Generate HVX code for basic arithmetic operations Handle and, or, xor, add, sub, mul for vectors of i8, i16, and i32. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@320063 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 859f6976c9b..f6d0239e450 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2000,18 +2000,24 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, bool Use64b = Subtarget.useHVX64BOps(); ArrayRef<MVT> LegalV = Use64b ? LegalV64 : LegalV128; ArrayRef<MVT> LegalW = Use64b ? LegalW64 : LegalW128; - MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8; - MVT ByteW = Use64b ? MVT::v128i8 : MVT::v256i8; + MVT ByteV = Use64b ? MVT::v64i8 : MVT::v128i8; + MVT ByteW = Use64b ?
MVT::v128i8 : MVT::v256i8; setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal); setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal); setOperationAction(ISD::CONCAT_VECTORS, ByteW, Legal); + setOperationAction(ISD::AND, ByteV, Legal); setOperationAction(ISD::OR, ByteV, Legal); + setOperationAction(ISD::XOR, ByteV, Legal); for (MVT T : LegalV) { setIndexedLoadAction(ISD::POST_INC, T, Legal); setIndexedStoreAction(ISD::POST_INC, T, Legal); + setOperationAction(ISD::ADD, T, Legal); + setOperationAction(ISD::SUB, T, Legal); + setOperationAction(ISD::MUL, T, Custom); + setOperationAction(ISD::BUILD_VECTOR, T, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, T, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, T, Custom); @@ -2025,7 +2031,9 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // Promote all shuffles and concats to operate on vectors of bytes. setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV); setPromoteTo(ISD::CONCAT_VECTORS, T, ByteV); + setPromoteTo(ISD::AND, T, ByteV); setPromoteTo(ISD::OR, T, ByteV); + setPromoteTo(ISD::XOR, T, ByteV); } for (MVT T : LegalW) { @@ -2792,6 +2800,10 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::INLINEASM: return LowerINLINEASM(Op, DAG); case ISD::PREFETCH: return LowerPREFETCH(Op, DAG); case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); + case ISD::MUL: + if (Subtarget.useHVXOps()) + return LowerHvxMul(Op, DAG); + break; } return SDValue(); } diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 17310914572..41fdd53714a 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -277,22 +277,14 @@ namespace HexagonISD { } private: - MVT ty(SDValue Op) const { - return Op.getValueType().getSimpleVT(); - } - MVT tyScalar(MVT Ty) const { - if (!Ty.isVector()) - return Ty; - return MVT::getIntegerVT(Ty.getSizeInBits()); - } - MVT tyVector(MVT Ty, MVT 
ElemTy) const { - if (Ty.isVector() && Ty.getVectorElementType() == ElemTy) - return Ty; - unsigned TyWidth = Ty.getSizeInBits(), ElemWidth = ElemTy.getSizeInBits(); - assert((TyWidth % ElemWidth) == 0); - return MVT::getVectorVT(ElemTy, TyWidth/ElemWidth); - } - + SDValue buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy, + SelectionDAG &DAG) const; + SDValue buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy, + SelectionDAG &DAG) const; + SDValue extractVector(SDValue VecV, SDValue IdxV, const SDLoc &dl, + MVT ValTy, MVT ResTy, SelectionDAG &DAG) const; + SDValue insertVector(SDValue VecV, SDValue ValV, SDValue IdxV, + const SDLoc &dl, MVT ValTy, SelectionDAG &DAG) const; bool isUndef(SDValue Op) const { if (Op.isMachineOpcode()) return Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF; @@ -303,14 +295,6 @@ namespace HexagonISD { SDNode *N = DAG.getMachineNode(MachineOpc, dl, Ty, Ops); return SDValue(N, 0); } - SDValue buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy, - SelectionDAG &DAG) const; - SDValue buildVector64(ArrayRef<SDValue> Elem, const SDLoc &dl, MVT VecTy, - SelectionDAG &DAG) const; - SDValue extractVector(SDValue VecV, SDValue IdxV, const SDLoc &dl, - MVT ValTy, MVT ResTy, SelectionDAG &DAG) const; - SDValue insertVector(SDValue VecV, SDValue ValV, SDValue IdxV, - const SDLoc &dl, MVT ValTy, SelectionDAG &DAG) const; using VectorPair = std::pair<SDValue, SDValue>; using TypePair = std::pair<MVT, MVT>; @@ -318,14 +302,28 @@ namespace HexagonISD { SDValue getInt(unsigned IntId, MVT ResTy, ArrayRef<SDValue> Ops, const SDLoc &dl, SelectionDAG &DAG) const; + MVT ty(SDValue Op) const { + return Op.getValueType().getSimpleVT(); + } TypePair ty(const VectorPair &Ops) const { return { Ops.first.getValueType().getSimpleVT(), Ops.second.getValueType().getSimpleVT() }; } + MVT tyScalar(MVT Ty) const { + if (!Ty.isVector()) + return Ty; + return MVT::getIntegerVT(Ty.getSizeInBits()); + } + MVT tyVector(MVT Ty, MVT ElemTy) const { + if (Ty.isVector() && Ty.getVectorElementType() ==
ElemTy) + return Ty; + unsigned TyWidth = Ty.getSizeInBits(), ElemWidth = ElemTy.getSizeInBits(); + assert((TyWidth % ElemWidth) == 0); + return MVT::getVectorVT(ElemTy, TyWidth/ElemWidth); + } MVT typeJoin(const TypePair &Tys) const; TypePair typeSplit(MVT Ty) const; - MVT typeCastElem(MVT VecTy, MVT ElemTy) const; MVT typeExtElem(MVT VecTy, unsigned Factor) const; MVT typeTruncElem(MVT VecTy, unsigned Factor) const; @@ -337,12 +335,15 @@ namespace HexagonISD { SDValue convertToByteIndex(SDValue ElemIdx, MVT ElemTy, SelectionDAG &DAG) const; SDValue getIndexInWord32(SDValue Idx, MVT ElemTy, SelectionDAG &DAG) const; + SDValue getByteShuffle(const SDLoc &dl, SDValue Op0, SDValue Op1, + ArrayRef<int> Mask, SelectionDAG &DAG) const; SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxMul(SDValue Op, SelectionDAG &DAG) const; std::pair<const TargetRegisterClass*, uint8_t> findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) diff --git a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index 6b25f74860e..de2ac42ad51 100644 --- a/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -111,6 +111,37 @@ HexagonTargetLowering::getIndexInWord32(SDValue Idx, MVT ElemTy, } SDValue +HexagonTargetLowering::getByteShuffle(const SDLoc &dl, SDValue Op0, + SDValue Op1, ArrayRef<int> Mask, + SelectionDAG &DAG) const { + MVT OpTy = ty(Op0); + assert(OpTy == ty(Op1)); + + MVT ElemTy = OpTy.getVectorElementType(); + if (ElemTy == MVT::i8) + return DAG.getVectorShuffle(OpTy, dl, Op0, Op1, Mask); + assert(ElemTy.getSizeInBits() >= 8); + + MVT ResTy = tyVector(OpTy, MVT::i8); + unsigned ElemSize = ElemTy.getSizeInBits() / 8; + +
SmallVector<int,128> ByteMask; + for (int M : Mask) { + if (M < 0) { + for (unsigned I = 0; I != ElemSize; ++I) + ByteMask.push_back(-1); + } else { + int NewM = M*ElemSize; + for (unsigned I = 0; I != ElemSize; ++I) + ByteMask.push_back(NewM+I); + } + } + assert(ResTy.getVectorNumElements() == ByteMask.size()); + return DAG.getVectorShuffle(ResTy, dl, opCastElem(Op0, MVT::i8, DAG), + opCastElem(Op1, MVT::i8, DAG), ByteMask); +} + +SDValue HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const { const SDLoc &dl(Op); @@ -276,7 +307,7 @@ HexagonTargetLowering::LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG) SDValue HexagonTargetLowering::LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG) const { - // Idx may be variable + // Idx may be variable. SDValue IdxV = Op.getOperand(2); auto *IdxN = dyn_cast<ConstantSDNode>(IdxV.getNode()); if (!IdxN) @@ -299,3 +330,56 @@ HexagonTargetLowering::LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG) return DAG.getTargetInsertSubreg(Hexagon::vsub_hi, dl, DstTy, DstV, SrcV); return SDValue(); } + +SDValue +HexagonTargetLowering::LowerHvxMul(SDValue Op, SelectionDAG &DAG) const { + MVT ResTy = ty(Op); + if (!ResTy.isVector()) + return SDValue(); + const SDLoc &dl(Op); + SmallVector<int,256> ShuffMask; + + MVT ElemTy = ResTy.getVectorElementType(); + unsigned VecLen = ResTy.getVectorNumElements(); + SDValue Vs = Op.getOperand(0); + SDValue Vt = Op.getOperand(1); + + switch (ElemTy.SimpleTy) { + case MVT::i8: + case MVT::i16: { + // For i8 vectors Vs = (a0, a1, ...), Vt = (b0, b1, ...), + // V6_vmpybv Vs, Vt produces a pair of i16 vectors Hi:Lo, + // where Lo = (a0*b0, a2*b2, ...), Hi = (a1*b1, a3*b3, ...). + // For i16, use V6_vmpyhv, which behaves in an analogous way to + // V6_vmpybv: results Lo and Hi are products of even/odd elements + // respectively. + MVT ExtTy = typeExtElem(ResTy, 2); + unsigned MpyOpc = ElemTy == MVT::i8 ?
Hexagon::V6_vmpybv + : Hexagon::V6_vmpyhv; + SDValue M = getNode(MpyOpc, dl, ExtTy, {Vs, Vt}, DAG); + + // Discard high halves of the resulting values, collect the low halves. + for (unsigned I = 0; I < VecLen; I += 2) { + ShuffMask.push_back(I); // Pick even element. + ShuffMask.push_back(I+VecLen); // Pick odd element. + } + VectorPair P = opSplit(opCastElem(M, ElemTy, DAG), dl, DAG); + return getByteShuffle(dl, P.first, P.second, ShuffMask, DAG); + } + case MVT::i32: { + // Use the following sequence for signed word multiply: + // T0 = V6_vmpyiowh Vs, Vt + // T1 = V6_vaslw T0, 16 + // T2 = V6_vmpyiewuh_acc T1, Vs, Vt + SDValue S16 = DAG.getConstant(16, dl, MVT::i32); + SDValue T0 = getNode(Hexagon::V6_vmpyiowh, dl, ResTy, {Vs, Vt}, DAG); + SDValue T1 = getNode(Hexagon::V6_vaslw, dl, ResTy, {T0, S16}, DAG); + SDValue T2 = getNode(Hexagon::V6_vmpyiewuh_acc, dl, ResTy, + {T1, Vs, Vt}, DAG); + return T2; + } + default: + break; + } + return SDValue(); +} diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index f1d01b0cee2..7fc1f80aa55 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -2917,8 +2917,6 @@ def HexagonVINSERTW0 : SDNode<"HexagonISD::VINSERTW0", SDTHexagonVINSERTW0>; let Predicates = [UseHVX] in { def: Pat<(concat_vectors HVI8:$Vs, HVI8:$Vt), (V6_vcombine HvxVR:$Vt, HvxVR:$Vs)>; - def: Pat<(or HVI8:$Vs, HVI8:$Vt), - (V6_vor HvxVR:$Vt, HvxVR:$Vs)>; def: Pat<(HexagonVEXTRACTW HVI8:$Vu, I32:$Rs), (V6_extractw HvxVR:$Vu, I32:$Rs)>; @@ -2933,4 +2931,16 @@ let Predicates = [UseHVX] in { (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>; def: Pat<(HexagonVINSERTW0 HVI32:$Vu, I32:$Rt), (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>; + + def: Pat<(add HVI8:$Vs, HVI8:$Vt), (V6_vaddb HvxVR:$Vs, HvxVR:$Vt)>; + def: Pat<(add HVI16:$Vs, HVI16:$Vt), (V6_vaddh HvxVR:$Vs, HvxVR:$Vt)>; + def: Pat<(add HVI32:$Vs, HVI32:$Vt), (V6_vaddw HvxVR:$Vs, HvxVR:$Vt)>; + + def: Pat<(sub HVI8:$Vs, HVI8:$Vt), (V6_vsubb 
HvxVR:$Vs, HvxVR:$Vt)>; + def: Pat<(sub HVI16:$Vs, HVI16:$Vt), (V6_vsubh HvxVR:$Vs, HvxVR:$Vt)>; + def: Pat<(sub HVI32:$Vs, HVI32:$Vt), (V6_vsubw HvxVR:$Vs, HvxVR:$Vt)>; + + def: Pat<(and HVI8:$Vs, HVI8:$Vt), (V6_vand HvxVR:$Vs, HvxVR:$Vt)>; + def: Pat<(or HVI8:$Vs, HVI8:$Vt), (V6_vor HvxVR:$Vs, HvxVR:$Vt)>; + def: Pat<(xor HVI8:$Vs, HVI8:$Vt), (V6_vxor HvxVR:$Vs, HvxVR:$Vt)>; } diff --git a/test/CodeGen/Hexagon/autohvx/arith.ll b/test/CodeGen/Hexagon/autohvx/arith.ll new file mode 100644 index 00000000000..8c8dee6fc16 --- /dev/null +++ b/test/CodeGen/Hexagon/autohvx/arith.ll @@ -0,0 +1,278 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; --- and + +; CHECK-LABEL: andb_64: +; CHECK: vand(v0,v1) +define <64 x i8> @andb_64(<64 x i8> %v0, <64 x i8> %v1) #0 { + %p = and <64 x i8> %v0, %v1 + ret <64 x i8> %p +} + +; CHECK-LABEL: andb_128: +; CHECK: vand(v0,v1) +define <128 x i8> @andb_128(<128 x i8> %v0, <128 x i8> %v1) #1 { + %p = and <128 x i8> %v0, %v1 + ret <128 x i8> %p +} + +; CHECK-LABEL: andh_64: +; CHECK: vand(v0,v1) +define <32 x i16> @andh_64(<32 x i16> %v0, <32 x i16> %v1) #0 { + %p = and <32 x i16> %v0, %v1 + ret <32 x i16> %p +} + +; CHECK-LABEL: andh_128: +; CHECK: vand(v0,v1) +define <64 x i16> @andh_128(<64 x i16> %v0, <64 x i16> %v1) #1 { + %p = and <64 x i16> %v0, %v1 + ret <64 x i16> %p +} + +; CHECK-LABEL: andw_64: +; CHECK: vand(v0,v1) +define <16 x i32> @andw_64(<16 x i32> %v0, <16 x i32> %v1) #0 { + %p = and <16 x i32> %v0, %v1 + ret <16 x i32> %p +} + +; CHECK-LABEL: andw_128: +; CHECK: vand(v0,v1) +define <32 x i32> @andw_128(<32 x i32> %v0, <32 x i32> %v1) #1 { + %p = and <32 x i32> %v0, %v1 + ret <32 x i32> %p +} + +; --- or + +; CHECK-LABEL: orb_64: +; CHECK: vor(v0,v1) +define <64 x i8> @orb_64(<64 x i8> %v0, <64 x i8> %v1) #0 { + %p = or <64 x i8> %v0, %v1 + ret <64 x i8> %p +} + +; CHECK-LABEL: orb_128: +; CHECK: vor(v0,v1) +define <128 x i8> @orb_128(<128 x i8> %v0, <128 x i8> %v1) #1 { + %p = or <128 x i8> %v0, %v1 + ret <128 x 
i8> %p +} + +; CHECK-LABEL: orh_64: +; CHECK: vor(v0,v1) +define <32 x i16> @orh_64(<32 x i16> %v0, <32 x i16> %v1) #0 { + %p = or <32 x i16> %v0, %v1 + ret <32 x i16> %p +} + +; CHECK-LABEL: orh_128: +; CHECK: vor(v0,v1) +define <64 x i16> @orh_128(<64 x i16> %v0, <64 x i16> %v1) #1 { + %p = or <64 x i16> %v0, %v1 + ret <64 x i16> %p +} + +; CHECK-LABEL: orw_64: +; CHECK: vor(v0,v1) +define <16 x i32> @orw_64(<16 x i32> %v0, <16 x i32> %v1) #0 { + %p = or <16 x i32> %v0, %v1 + ret <16 x i32> %p +} + +; CHECK-LABEL: orw_128: +; CHECK: vor(v0,v1) +define <32 x i32> @orw_128(<32 x i32> %v0, <32 x i32> %v1) #1 { + %p = or <32 x i32> %v0, %v1 + ret <32 x i32> %p +} + +; --- xor + +; CHECK-LABEL: xorb_64: +; CHECK: vxor(v0,v1) +define <64 x i8> @xorb_64(<64 x i8> %v0, <64 x i8> %v1) #0 { + %p = xor <64 x i8> %v0, %v1 + ret <64 x i8> %p +} + +; CHECK-LABEL: xorb_128: +; CHECK: vxor(v0,v1) +define <128 x i8> @xorb_128(<128 x i8> %v0, <128 x i8> %v1) #1 { + %p = xor <128 x i8> %v0, %v1 + ret <128 x i8> %p +} + +; CHECK-LABEL: xorh_64: +; CHECK: vxor(v0,v1) +define <32 x i16> @xorh_64(<32 x i16> %v0, <32 x i16> %v1) #0 { + %p = xor <32 x i16> %v0, %v1 + ret <32 x i16> %p +} + +; CHECK-LABEL: xorh_128: +; CHECK: vxor(v0,v1) +define <64 x i16> @xorh_128(<64 x i16> %v0, <64 x i16> %v1) #1 { + %p = xor <64 x i16> %v0, %v1 + ret <64 x i16> %p +} + +; CHECK-LABEL: xorw_64: +; CHECK: vxor(v0,v1) +define <16 x i32> @xorw_64(<16 x i32> %v0, <16 x i32> %v1) #0 { + %p = xor <16 x i32> %v0, %v1 + ret <16 x i32> %p +} + +; CHECK-LABEL: xorw_128: +; CHECK: vxor(v0,v1) +define <32 x i32> @xorw_128(<32 x i32> %v0, <32 x i32> %v1) #1 { + %p = xor <32 x i32> %v0, %v1 + ret <32 x i32> %p +} + +; --- add + +; CHECK-LABEL: addb_64: +; CHECK: vadd(v0.b,v1.b) +define <64 x i8> @addb_64(<64 x i8> %v0, <64 x i8> %v1) #0 { + %p = add <64 x i8> %v0, %v1 + ret <64 x i8> %p +} + +; CHECK-LABEL: addb_128: +; CHECK: vadd(v0.b,v1.b) +define <128 x i8> @addb_128(<128 x i8> %v0, <128 x i8> %v1) #1 { + %p = 
add <128 x i8> %v0, %v1 + ret <128 x i8> %p +} + +; CHECK-LABEL: addh_64: +; CHECK: vadd(v0.h,v1.h) +define <32 x i16> @addh_64(<32 x i16> %v0, <32 x i16> %v1) #0 { + %p = add <32 x i16> %v0, %v1 + ret <32 x i16> %p +} + +; CHECK-LABEL: addh_128: +; CHECK: vadd(v0.h,v1.h) +define <64 x i16> @addh_128(<64 x i16> %v0, <64 x i16> %v1) #1 { + %p = add <64 x i16> %v0, %v1 + ret <64 x i16> %p +} + +; CHECK-LABEL: addw_64: +; CHECK: vadd(v0.w,v1.w) +define <16 x i32> @addw_64(<16 x i32> %v0, <16 x i32> %v1) #0 { + %p = add <16 x i32> %v0, %v1 + ret <16 x i32> %p +} + +; CHECK-LABEL: addw_128: +; CHECK: vadd(v0.w,v1.w) +define <32 x i32> @addw_128(<32 x i32> %v0, <32 x i32> %v1) #1 { + %p = add <32 x i32> %v0, %v1 + ret <32 x i32> %p +} + +; --- sub + +; CHECK-LABEL: subb_64: +; CHECK: vsub(v0.b,v1.b) +define <64 x i8> @subb_64(<64 x i8> %v0, <64 x i8> %v1) #0 { + %p = sub <64 x i8> %v0, %v1 + ret <64 x i8> %p +} + +; CHECK-LABEL: subb_128: +; CHECK: vsub(v0.b,v1.b) +define <128 x i8> @subb_128(<128 x i8> %v0, <128 x i8> %v1) #1 { + %p = sub <128 x i8> %v0, %v1 + ret <128 x i8> %p +} + +; CHECK-LABEL: subh_64: +; CHECK: vsub(v0.h,v1.h) +define <32 x i16> @subh_64(<32 x i16> %v0, <32 x i16> %v1) #0 { + %p = sub <32 x i16> %v0, %v1 + ret <32 x i16> %p +} + +; CHECK-LABEL: subh_128: +; CHECK: vsub(v0.h,v1.h) +define <64 x i16> @subh_128(<64 x i16> %v0, <64 x i16> %v1) #1 { + %p = sub <64 x i16> %v0, %v1 + ret <64 x i16> %p +} + +; CHECK-LABEL: subw_64: +; CHECK: vsub(v0.w,v1.w) +define <16 x i32> @subw_64(<16 x i32> %v0, <16 x i32> %v1) #0 { + %p = sub <16 x i32> %v0, %v1 + ret <16 x i32> %p +} + +; CHECK-LABEL: subw_128: +; CHECK: vsub(v0.w,v1.w) +define <32 x i32> @subw_128(<32 x i32> %v0, <32 x i32> %v1) #1 { + %p = sub <32 x i32> %v0, %v1 + ret <32 x i32> %p +} + +; --- mul + +; CHECK-LABEL: mpyb_64: +; CHECK: v[[H00:[0-9]+]]:[[L00:[0-9]+]].h = vmpy(v0.b,v1.b) +; CHECK: vshuffe(v[[H00]].b,v[[L00]].b) +define <64 x i8> @mpyb_64(<64 x i8> %v0, <64 x i8> %v1) #0 { + %p = mul 
<64 x i8> %v0, %v1 + ret <64 x i8> %p +} + +; CHECK-LABEL: mpyb_128: +; CHECK: v[[H10:[0-9]+]]:[[L10:[0-9]+]].h = vmpy(v0.b,v1.b) +; CHECK: vshuffe(v[[H10]].b,v[[L10]].b) +define <128 x i8> @mpyb_128(<128 x i8> %v0, <128 x i8> %v1) #1 { + %p = mul <128 x i8> %v0, %v1 + ret <128 x i8> %p +} + +; CHECK-LABEL: mpyh_64: +; CHECK: v[[H01:[0-9]+]]:[[L01:[0-9]+]].w = vmpy(v0.h,v1.h) +; CHECK: vshuffe(v[[H01]].h,v[[L01]].h) +define <32 x i16> @mpyh_64(<32 x i16> %v0, <32 x i16> %v1) #0 { + %p = mul <32 x i16> %v0, %v1 + ret <32 x i16> %p +} + +; CHECK-LABEL: mpyh_128: +; CHECK: v[[H11:[0-9]+]]:[[L11:[0-9]+]].w = vmpy(v0.h,v1.h) +; CHECK: vshuffe(v[[H11]].h,v[[L11]].h) +define <64 x i16> @mpyh_128(<64 x i16> %v0, <64 x i16> %v1) #1 { + %p = mul <64 x i16> %v0, %v1 + ret <64 x i16> %p +} + +; CHECK-LABEL: mpyw_64: +; CHECK-DAG: r[[T00:[0-9]+]] = #16 +; CHECK-DAG: v[[T01:[0-9]+]].w = vmpyio(v0.w,v1.h) +; CHECK: v[[T02:[0-9]+]].w = vasl(v[[T01]].w,r[[T00]]) +; CHECK: v[[T02]].w += vmpyie(v0.w,v1.uh) +define <16 x i32> @mpyw_64(<16 x i32> %v0, <16 x i32> %v1) #0 { + %p = mul <16 x i32> %v0, %v1 + ret <16 x i32> %p +} + +; CHECK-LABEL: mpyw_128: +; CHECK-DAG: r[[T10:[0-9]+]] = #16 +; CHECK-DAG: v[[T11:[0-9]+]].w = vmpyio(v0.w,v1.h) +; CHECK: v[[T12:[0-9]+]].w = vasl(v[[T11]].w,r[[T10]]) +; CHECK: v[[T12]].w += vmpyie(v0.w,v1.uh) +define <32 x i32> @mpyw_128(<32 x i32> %v0, <32 x i32> %v1) #1 { + %p = mul <32 x i32> %v0, %v1 + ret <32 x i32> %p +} + +attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length64b" } +attributes #1 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length128b" }