From f645d8539e6dd8d0d7d6b604f36b9cc7b05d1886 Mon Sep 17 00:00:00 2001
From: Jan Voung <jvoung@chromium.org>
Date: Thu, 9 Jul 2015 10:35:09 -0700
Subject: [PATCH] ARM32: Lower more integer intrinsics and test.

Lower stacksave/restore. Lower ctlz, cttz, bswap, and popcount.
Popcount is done with a helper call. Cttz uses the clz instruction
after reversing the bits with rbit.

For now we can only crosstest stacksave/restore, which happens to be
written in C because of the C99 VLAs. The C++ crosstests don't yet
compile with arm-cross-g++ (missing headers), so I will revisit them
after resolving the cross-compilation issue.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=jpp@chromium.org

Review URL: https://codereview.chromium.org/1222943003 .
---
 Makefile.standalone                               |   3 +-
 src/IceInstARM32.cpp                              |  14 +--
 src/IceInstARM32.h                                |  20 ++--
 src/IceTargetLowering.h                           |   3 +
 src/IceTargetLoweringARM32.cpp                    | 132 +++++++++++++++++++---
 src/IceTargetLoweringARM32.h                      |  14 +++
 src/IceTargetLoweringX86Base.h                    |   4 +-
 src/IceTargetLoweringX86BaseImpl.h                |   8 +-
 tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll | 105 ++++++++++++++---
 9 files changed, 253 insertions(+), 50 deletions(-)

diff --git a/Makefile.standalone b/Makefile.standalone
index 83eddac13..3d61754d8 100644
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -327,7 +327,8 @@ check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime
 	  --toolchain-root $(TOOLCHAIN_ROOT) \
 	  -i x8632,native,sse2 -i x8632,native,sse4.1,test_vector_ops \
 	  -i x8632,sandbox,sse4.1,Om1 \
-	  -i arm32,native,neon,Om1,simple_loop
+	  -i arm32,native,neon,Om1,simple_loop \
+	  -i arm32,native,neon,Om1,test_stacksave
 	PNACL_BIN_PATH=$(PNACL_BIN_PATH) \
 	  $(LLVM_SRC_PATH)/utils/lit/lit.py -sv crosstest/Output
 endif

diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 72178ac77..e95f6b111 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -80,18 +80,15 @@ CondARM32::Cond InstARM32::getOppositeCondition(CondARM32::Cond Cond) {
 }
 
 void InstARM32Pred::emitUnaryopGPR(const char *Opcode,
-                                   const InstARM32Pred *Inst, const Cfg *Func) {
+                                   const InstARM32Pred *Inst, const Cfg *Func,
+                                   bool NeedsWidthSuffix) {
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(Inst->getSrcSize() == 1);
   Type SrcTy = Inst->getSrc(0)->getType();
-  Type DestTy = Inst->getDest()->getType();
   Str << "\t" << Opcode;
-  // Sxt and Uxt need source type width letter to define the operation.
-  // The other unary operations have the same source and dest type and
-  // as a result need only one letter.
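Illustration, not part of the patch: the cttz lowering described in the commit
message relies on the identity cttz(x) == clz(rbit(x)). A minimal sketch, using
Clang builtins as stand-ins for the ARM rbit and clz instructions:

#include <cstdint>

// cttz(x) computed the way the ARM32 lowering does it: bit-reverse the value,
// then count leading zeros.
static uint32_t cttz32(uint32_t x) {
  if (x == 0)
    return 32;                                   // ARM clz yields 32 for 0; guard the builtin's UB
  uint32_t Reversed = __builtin_bitreverse32(x); // stands in for "rbit"
  return __builtin_clz(Reversed);                // stands in for "clz"
}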
- if (SrcTy != DestTy) + if (NeedsWidthSuffix) Str << getWidthString(SrcTy); - Str << "\t"; + Str << Inst->getPredicate() << "\t"; Inst->getDest()->emit(Func); Str << ", "; Inst->getSrc(0)->emit(Func); @@ -358,7 +355,10 @@ InstARM32Umull::InstARM32Umull(Cfg *Func, Variable *DestLo, Variable *DestHi, template <> const char *InstARM32Movt::Opcode = "movt"; // Unary ops template <> const char *InstARM32Movw::Opcode = "movw"; +template <> const char *InstARM32Clz::Opcode = "clz"; template <> const char *InstARM32Mvn::Opcode = "mvn"; +template <> const char *InstARM32Rbit::Opcode = "rbit"; +template <> const char *InstARM32Rev::Opcode = "rev"; template <> const char *InstARM32Sxt::Opcode = "sxt"; // still requires b/h template <> const char *InstARM32Uxt::Opcode = "uxt"; // still requires b/h // Mov-like ops diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h index d12c4ffb9..8a7e1da3e 100644 --- a/src/IceInstARM32.h +++ b/src/IceInstARM32.h @@ -262,6 +262,7 @@ public: Br, Call, Cmp, + Clz, Eor, Label, Ldr, @@ -277,7 +278,9 @@ public: Orr, Pop, Push, + Rbit, Ret, + Rev, Rsb, Sbc, Sdiv, @@ -324,7 +327,7 @@ public: /// Shared emit routines for common forms of instructions. static void emitUnaryopGPR(const char *Opcode, const InstARM32Pred *Inst, - const Cfg *Func); + const Cfg *Func, bool NeedsWidthSuffix); static void emitTwoAddr(const char *Opcode, const InstARM32Pred *Inst, const Cfg *Func); static void emitThreeAddr(const char *Opcode, const InstARM32Pred *Inst, @@ -345,7 +348,7 @@ inline StreamType &operator<<(StreamType &Stream, CondARM32::Cond Predicate) { } /// Instructions of the form x := op(y). -template +template class InstARM32UnaryopGPR : public InstARM32Pred { InstARM32UnaryopGPR() = delete; InstARM32UnaryopGPR(const InstARM32UnaryopGPR &) = delete; @@ -360,7 +363,7 @@ public: void emit(const Cfg *Func) const override { if (!BuildDefs::dump()) return; - emitUnaryopGPR(Opcode, this, Func); + emitUnaryopGPR(Opcode, this, Func, NeedsWidthSuffix); } void emitIAS(const Cfg *Func) const override { (void)Func; @@ -641,13 +644,16 @@ typedef InstARM32Movlike InstARM32Mov; /// MovT leaves the bottom bits alone so dest is also a source. /// This helps indicate that a previous MovW setting dest is not dead code. typedef InstARM32TwoAddrGPR InstARM32Movt; -typedef InstARM32UnaryopGPR InstARM32Movw; -typedef InstARM32UnaryopGPR InstARM32Mvn; +typedef InstARM32UnaryopGPR InstARM32Movw; +typedef InstARM32UnaryopGPR InstARM32Clz; +typedef InstARM32UnaryopGPR InstARM32Mvn; +typedef InstARM32UnaryopGPR InstARM32Rbit; +typedef InstARM32UnaryopGPR InstARM32Rev; // Technically, the uxt{b,h} and sxt{b,h} instructions have a rotation // operand as well (rotate source by 8, 16, 24 bits prior to extending), // but we aren't using that for now, so just model as a Unaryop. 
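Aside (hypothetical helper, not in the patch): the effect of the new
NeedsWidthSuffix template flag is that sxt/uxt still get a width letter derived
from the source type, while clz, rbit, rev, mvn, and movw emit their mnemonic
unchanged. A minimal sketch:

#include <string>

// Builds the printed mnemonic; SrcIsByte is an assumed stand-in for querying
// the width of the source type.
static std::string mnemonicFor(const char *Opcode, bool NeedsWidthSuffix,
                               bool SrcIsByte) {
  std::string Mnemonic = Opcode;
  if (NeedsWidthSuffix)
    Mnemonic += SrcIsByte ? "b" : "h"; // "sxt" -> "sxtb" or "sxth"
  return Mnemonic;                     // "rev" stays "rev"
}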
-typedef InstARM32UnaryopGPR InstARM32Sxt; -typedef InstARM32UnaryopGPR InstARM32Uxt; +typedef InstARM32UnaryopGPR InstARM32Sxt; +typedef InstARM32UnaryopGPR InstARM32Uxt; typedef InstARM32FourAddrGPR InstARM32Mla; typedef InstARM32FourAddrGPR InstARM32Mls; typedef InstARM32CmpLike InstARM32Cmp; diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h index 64672c43e..a5b52ce91 100644 --- a/src/IceTargetLowering.h +++ b/src/IceTargetLowering.h @@ -321,6 +321,9 @@ protected: Context.insert(InstBundleLock::create(Func, BundleOption)); } void _bundle_unlock() { Context.insert(InstBundleUnlock::create(Func)); } + void _set_dest_nonkillable() { + Context.getLastInserted()->setDestNonKillable(); + } Cfg *Func; GlobalContext *Ctx; diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp index 6639da855..10fdfe12a 100644 --- a/src/IceTargetLoweringARM32.cpp +++ b/src/IceTargetLoweringARM32.cpp @@ -1575,7 +1575,7 @@ void TargetARM32::lowerCall(const InstCall *Instr) { // Copy arguments that are passed on the stack to the appropriate // stack locations. - Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp); + Variable *SP = getPhysicalRegister(RegARM32::Reg_sp); for (auto &StackArg : StackArgs) { ConstantInteger32 *Loc = llvm::cast(Ctx->getConstantInt32(StackArg.second)); @@ -1662,7 +1662,7 @@ void TargetARM32::lowerCall(const InstCall *Instr) { if (ParameterAreaSizeBytes) { Operand *AddAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes), Legal_Reg | Legal_Flex); - Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp); + Variable *SP = getPhysicalRegister(RegARM32::Reg_sp); _add(SP, SP, AddAmount); } @@ -2032,19 +2032,91 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { return; } case Intrinsics::Bswap: { - UnimplementedError(Func->getContext()->getFlags()); + Variable *Dest = Instr->getDest(); + Operand *Val = Instr->getArg(0); + Type Ty = Val->getType(); + if (Ty == IceType_i64) { + Variable *Val_Lo = legalizeToVar(loOperand(Val)); + Variable *Val_Hi = legalizeToVar(hiOperand(Val)); + Variable *T_Lo = makeReg(IceType_i32); + Variable *T_Hi = makeReg(IceType_i32); + Variable *DestLo = llvm::cast(loOperand(Dest)); + Variable *DestHi = llvm::cast(hiOperand(Dest)); + _rev(T_Lo, Val_Lo); + _rev(T_Hi, Val_Hi); + _mov(DestLo, T_Hi); + _mov(DestHi, T_Lo); + } else { + assert(Ty == IceType_i32 || Ty == IceType_i16); + Variable *ValR = legalizeToVar(Val); + Variable *T = makeReg(Ty); + _rev(T, ValR); + if (Val->getType() == IceType_i16) { + Operand *Sixteen = + legalize(Ctx->getConstantInt32(16), Legal_Reg | Legal_Flex); + _lsr(T, T, Sixteen); + } + _mov(Dest, T); + } return; } case Intrinsics::Ctpop: { - UnimplementedError(Func->getContext()->getFlags()); + Variable *Dest = Instr->getDest(); + Operand *Val = Instr->getArg(0); + InstCall *Call = makeHelperCall(isInt32Asserting32Or64(Val->getType()) + ? H_call_ctpop_i32 + : H_call_ctpop_i64, + Dest, 1); + Call->addArg(Val); + lowerCall(Call); + // The popcount helpers always return 32-bit values, while the intrinsic's + // signature matches some 64-bit platform's native instructions and + // expect to fill a 64-bit reg. Thus, clear the upper bits of the dest + // just in case the user doesn't do that in the IR or doesn't toss the bits + // via truncate. 
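Semantics sketch, not compiler code: the comment above is why the i64 ctpop
lowering just below ends with a move of zero into the high destination word.
__builtin_popcountll stands in for the __popcountdi2 helper, which only
produces a 32-bit count.

#include <cstdint>

static uint64_t ctpop64(uint64_t x) {
  uint32_t Count = __builtin_popcountll(x); // helper call returns a 32-bit result
  return static_cast<uint64_t>(Count);      // "mov DestHi, #0": upper 32 bits cleared
}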
+ if (Val->getType() == IceType_i64) { + Variable *DestHi = llvm::cast(hiOperand(Dest)); + Constant *Zero = Ctx->getConstantZero(IceType_i32); + _mov(DestHi, Zero); + } return; } case Intrinsics::Ctlz: { - UnimplementedError(Func->getContext()->getFlags()); + // The "is zero undef" parameter is ignored and we always return + // a well-defined value. + Operand *Val = Instr->getArg(0); + Variable *ValLoR; + Variable *ValHiR = nullptr; + if (Val->getType() == IceType_i64) { + ValLoR = legalizeToVar(loOperand(Val)); + ValHiR = legalizeToVar(hiOperand(Val)); + } else { + ValLoR = legalizeToVar(Val); + } + lowerCLZ(Instr->getDest(), ValLoR, ValHiR); return; } case Intrinsics::Cttz: { - UnimplementedError(Func->getContext()->getFlags()); + // Essentially like Clz, but reverse the bits first. + Operand *Val = Instr->getArg(0); + Variable *ValLoR; + Variable *ValHiR = nullptr; + if (Val->getType() == IceType_i64) { + ValLoR = legalizeToVar(loOperand(Val)); + ValHiR = legalizeToVar(hiOperand(Val)); + Variable *TLo = makeReg(IceType_i32); + Variable *THi = makeReg(IceType_i32); + _rbit(TLo, ValLoR); + _rbit(THi, ValHiR); + ValLoR = THi; + ValHiR = TLo; + } else { + ValLoR = legalizeToVar(Val); + Variable *T = makeReg(IceType_i32); + _rbit(T, ValLoR); + ValLoR = T; + } + lowerCLZ(Instr->getDest(), ValLoR, ValHiR); return; } case Intrinsics::Fabs: { @@ -2077,13 +2149,15 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { return; } case Intrinsics::Memset: { - // The value operand needs to be extended to a stack slot size - // because the PNaCl ABI requires arguments to be at least 32 bits - // wide. + // The value operand needs to be extended to a stack slot size because the + // PNaCl ABI requires arguments to be at least 32 bits wide. Operand *ValOp = Instr->getArg(1); assert(ValOp->getType() == IceType_i8); Variable *ValExt = Func->makeVariable(stackSlotType()); lowerCast(InstCast::create(Func, InstCast::Zext, ValExt, ValOp)); + // Technically, ARM has their own __aeabi_memset, but we can use plain + // memset too. The value and size argument need to be flipped if we ever + // decide to use __aeabi_memset. 
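For reference (prototypes as I understand the ARM run-time ABI; treat them as
an assumption rather than something this patch relies on): plain memset and
__aeabi_memset take the value and size arguments in opposite orders, which is
why they would have to be swapped if the lowering ever switched helpers.

#include <cstddef>

extern "C" void *memset(void *Dest, int Value, size_t N);        // value, then size
extern "C" void __aeabi_memset(void *Dest, size_t N, int Value); // size, then value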
InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3); Call->addArg(Instr->getArg(0)); Call->addArg(ValExt); @@ -2111,15 +2185,19 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { return; } case Intrinsics::Stacksave: { - UnimplementedError(Func->getContext()->getFlags()); + Variable *SP = getPhysicalRegister(RegARM32::Reg_sp); + Variable *Dest = Instr->getDest(); + _mov(Dest, SP); return; } case Intrinsics::Stackrestore: { - UnimplementedError(Func->getContext()->getFlags()); + Variable *SP = getPhysicalRegister(RegARM32::Reg_sp); + Operand *Val = legalize(Instr->getArg(0), Legal_Reg | Legal_Flex); + _mov_nonkillable(SP, Val); return; } case Intrinsics::Trap: - UnimplementedError(Func->getContext()->getFlags()); + _trap(); return; case Intrinsics::UnknownIntrinsic: Func->setError("Should not be lowering UnknownIntrinsic"); @@ -2128,6 +2206,34 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { return; } +void TargetARM32::lowerCLZ(Variable *Dest, Variable *ValLoR, Variable *ValHiR) { + Type Ty = Dest->getType(); + assert(Ty == IceType_i32 || Ty == IceType_i64); + Variable *T = makeReg(IceType_i32); + _clz(T, ValLoR); + if (Ty == IceType_i64) { + Variable *DestLo = llvm::cast(loOperand(Dest)); + Variable *DestHi = llvm::cast(hiOperand(Dest)); + Operand *Zero = + legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex); + Operand *ThirtyTwo = + legalize(Ctx->getConstantInt32(32), Legal_Reg | Legal_Flex); + _cmp(ValHiR, Zero); + Variable *T2 = makeReg(IceType_i32); + _add(T2, T, ThirtyTwo); + _clz(T2, ValHiR, CondARM32::NE); + // T2 is actually a source as well when the predicate is not AL + // (since it may leave T2 alone). We use set_dest_nonkillable to + // prolong the liveness of T2 as if it was used as a source. + _set_dest_nonkillable(); + _mov(DestLo, T2); + _mov(DestHi, Ctx->getConstantZero(IceType_i32)); + return; + } + _mov(Dest, T); + return; +} + void TargetARM32::lowerLoad(const InstLoad *Load) { // A Load instruction can be treated the same as an Assign // instruction, after the source operand is transformed into an @@ -2186,7 +2292,7 @@ void TargetARM32::lowerRet(const InstRet *Inst) { // eliminated. TODO: Are there more places where the fake use // should be inserted? E.g. "void f(int n){while(1) g(n);}" may not // have a ret instruction. - Variable *SP = Func->getTarget()->getPhysicalRegister(RegARM32::Reg_sp); + Variable *SP = getPhysicalRegister(RegARM32::Reg_sp); Context.insert(InstFakeUse::create(Func, SP)); } diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h index 8aa3e11b5..becb6152e 100644 --- a/src/IceTargetLoweringARM32.h +++ b/src/IceTargetLoweringARM32.h @@ -172,6 +172,8 @@ protected: ExtInstr ExtFunc, DivInstr DivFunc, const char *DivHelperName, bool IsRemainder); + void lowerCLZ(Variable *Dest, Variable *ValLo, Variable *ValHi); + // The following are helpers that insert lowered ARM32 instructions // with minimal syntactic overhead, so that the lowering code can // look as close to assembly as practical. 
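Sketch, not compiler code: the value computed by the i64 path of lowerCLZ above
(the cmp/add/clzne sequence) can be written as plain C++ like this, with
__builtin_clz standing in for the ARM clz instruction.

#include <cstdint>

static uint32_t clz32(uint32_t x) {
  return x == 0 ? 32 : __builtin_clz(x); // ARM clz is defined to return 32 for 0
}

static uint64_t ctlz64(uint32_t Lo, uint32_t Hi) {
  uint32_t Result = clz32(Lo) + 32; // "add T2, T, #32": assume the high word is zero
  if (Hi != 0)
    Result = clz32(Hi);             // "clzne T2, Hi": override when it is not
  return Result;                    // high word of the i64 result is always zero
}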
@@ -224,6 +226,10 @@ protected: CondARM32::Cond Pred = CondARM32::AL) { Context.insert(InstARM32Cmp::create(Func, Src0, Src1, Pred)); } + void _clz(Variable *Dest, Variable *Src0, + CondARM32::Cond Pred = CondARM32::AL) { + Context.insert(InstARM32Clz::create(Func, Dest, Src0, Pred)); + } void _eor(Variable *Dest, Variable *Src0, Operand *Src1, CondARM32::Cond Pred = CondARM32::AL) { Context.insert(InstARM32Eor::create(Func, Dest, Src0, Src1, Pred)); @@ -301,6 +307,14 @@ protected: for (Variable *Dest : Dests) Context.insert(InstFakeDef::create(Func, Dest)); } + void _rbit(Variable *Dest, Variable *Src0, + CondARM32::Cond Pred = CondARM32::AL) { + Context.insert(InstARM32Rbit::create(Func, Dest, Src0, Pred)); + } + void _rev(Variable *Dest, Variable *Src0, + CondARM32::Cond Pred = CondARM32::AL) { + Context.insert(InstARM32Rev::create(Func, Dest, Src0, Pred)); + } void _ret(Variable *LR, Variable *Src0 = nullptr) { Context.insert(InstARM32Ret::create(Func, LR, Src0)); } diff --git a/src/IceTargetLoweringX86Base.h b/src/IceTargetLoweringX86Base.h index 32d7d6bfb..c00f0b0e5 100644 --- a/src/IceTargetLoweringX86Base.h +++ b/src/IceTargetLoweringX86Base.h @@ -96,6 +96,7 @@ public: using Machine::_bundle_lock; using Machine::_bundle_unlock; + using Machine::_set_dest_nonkillable; using Machine::getContext; using Machine::getStackAdjustment; using Machine::regAlloc; @@ -587,9 +588,6 @@ protected: void _xor_rmw(typename Traits::X86OperandMem *DestSrc0, Operand *Src1) { Context.insert(Traits::Insts::XorRMW::create(Func, DestSrc0, Src1)); } - void _set_dest_nonkillable() { - Context.getLastInserted()->setDestNonKillable(); - } bool optimizeScalarMul(Variable *Dest, Operand *Src0, int32_t Src1); void findRMW(); diff --git a/src/IceTargetLoweringX86BaseImpl.h b/src/IceTargetLoweringX86BaseImpl.h index 68cbf94d2..a277db23a 100644 --- a/src/IceTargetLoweringX86BaseImpl.h +++ b/src/IceTargetLoweringX86BaseImpl.h @@ -3521,9 +3521,8 @@ void TargetX86Base::lowerIntrinsicCall( return; } case Intrinsics::Memset: { - // The value operand needs to be extended to a stack slot size - // because the PNaCl ABI requires arguments to be at least 32 bits - // wide. + // The value operand needs to be extended to a stack slot size because the + // PNaCl ABI requires arguments to be at least 32 bits wide. Operand *ValOp = Instr->getArg(1); assert(ValOp->getType() == IceType_i8); Variable *ValExt = Func->template makeVariable(stackSlotType()); @@ -5257,8 +5256,7 @@ Operand *TargetX86Base::randomizeOrPoolImmediate(Constant *Immediate, _lea(Reg, Traits::X86OperandMem::create(Func, IceType_i32, Reg, Offset, nullptr, 0)); // make sure liveness analysis won't kill this variable, otherwise a - // liveness - // assertion will be triggered. + // liveness assertion will be triggered. _set_dest_nonkillable(); if (Immediate->getType() != IceType_i32) { Variable *TruncReg = makeReg(Immediate->getType(), RegNum); diff --git a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll index da56571f2..ef72d6ecb 100644 --- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll +++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll @@ -1,21 +1,34 @@ ; This tests the NaCl intrinsics not related to atomic operations. 
-; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 -sandbox \ -; RUN: | FileCheck %s -; RUN: %p2i -i %s --filetype=obj --disassemble --args -Om1 -sandbox \ -; RUN: | FileCheck %s +; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \ +; RUN: --target x8632 -i %s --args -O2 -sandbox \ +; RUN: | %if --need=target_X8632 --command FileCheck %s +; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \ +; RUN: --target x8632 -i %s --args -Om1 -sandbox \ +; RUN: | %if --need=target_X8632 --command FileCheck %s ; Do another run w/ O2 and a different check-prefix (otherwise O2 and Om1 ; share the same "CHECK" prefix). This separate run helps check that ; some code is optimized out. -; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 -sandbox \ -; RUN: | FileCheck --check-prefix=CHECKO2REM %s +; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \ +; RUN: --target x8632 -i %s --args -O2 -sandbox \ +; RUN: | %if --need=target_X8632 \ +; RUN: --command FileCheck --check-prefix=CHECKO2REM %s ; Do O2 runs without -sandbox to make sure llvm.nacl.read.tp gets ; lowered to __nacl_read_tp instead of gs:0x0. ; We also know that because it's O2, it'll have the O2REM optimizations. -; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 \ -; RUN: | FileCheck --check-prefix=CHECKO2UNSANDBOXEDREM %s +; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \ +; RUN: --target x8632 -i %s --args -O2 \ +; RUN: | %if --need=target_X8632 \ +; RUN: --command FileCheck --check-prefix=CHECKO2UNSANDBOXEDREM %s + +; RUN: %if --need=target_ARM32 --need=allow_dump \ +; RUN: --command %p2i --filetype=asm --assemble --disassemble --target arm32 \ +; RUN: -i %s --args -O2 --skip-unimplemented \ +; RUN: | %if --need=target_ARM32 --need=allow_dump \ +; RUN: --command FileCheck --check-prefix ARM32 %s + declare i8* @llvm.nacl.read.tp() declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) @@ -106,6 +119,8 @@ entry: ; CHECK: call {{.*}} R_{{.*}} memcpy ; CHECKO2REM-LABEL: test_memcpy ; CHECKO2UNSANDBOXEDREM-LABEL: test_memcpy +; ARM32-LABEL: test_memcpy +; ARM32: bl {{.*}} memcpy ; TODO(jvoung) -- if we want to be clever, we can do this and the memmove, ; memset without a function call. 
@@ -114,11 +129,13 @@ entry: %dst = inttoptr i32 %iptr_dst to i8* %src = inttoptr i32 %iptr_src to i8* call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %src, - i32 8, i32 1, i1 false) + i32 32, i32 1, i1 false) ret void } ; CHECK-LABEL: test_memcpy_const_len_align ; CHECK: call {{.*}} R_{{.*}} memcpy +; ARM32-LABEL: test_memcpy_const_len_align +; ARM32: bl {{.*}} memcpy define void @test_memmove(i32 %iptr_dst, i32 %iptr_src, i32 %len) { entry: @@ -130,17 +147,21 @@ entry: } ; CHECK-LABEL: test_memmove ; CHECK: call {{.*}} R_{{.*}} memmove +; ARM32-LABEL: test_memmove +; ARM32: bl {{.*}} memmove define void @test_memmove_const_len_align(i32 %iptr_dst, i32 %iptr_src) { entry: %dst = inttoptr i32 %iptr_dst to i8* %src = inttoptr i32 %iptr_src to i8* call void @llvm.memmove.p0i8.p0i8.i32(i8* %dst, i8* %src, - i32 8, i32 1, i1 false) + i32 32, i32 1, i1 false) ret void } ; CHECK-LABEL: test_memmove_const_len_align ; CHECK: call {{.*}} R_{{.*}} memmove +; ARM32-LABEL: test_memmove_const_len_align +; ARM32: bl {{.*}} memmove define void @test_memset(i32 %iptr_dst, i32 %wide_val, i32 %len) { entry: @@ -153,18 +174,24 @@ entry: ; CHECK-LABEL: test_memset ; CHECK: movzx ; CHECK: call {{.*}} R_{{.*}} memset +; ARM32-LABEL: test_memset +; ARM32: uxtb +; ARM32: bl {{.*}} memset define void @test_memset_const_len_align(i32 %iptr_dst, i32 %wide_val) { entry: %val = trunc i32 %wide_val to i8 %dst = inttoptr i32 %iptr_dst to i8* call void @llvm.memset.p0i8.i32(i8* %dst, i8 %val, - i32 8, i32 1, i1 false) + i32 32, i32 1, i1 false) ret void } ; CHECK-LABEL: test_memset_const_len_align ; CHECK: movzx ; CHECK: call {{.*}} R_{{.*}} memset +; ARM32-LABEL: test_memset_const_len_align +; ARM32: uxtb +; ARM32: bl {{.*}} memset define void @test_memset_const_val(i32 %iptr_dst, i32 %len) { entry: @@ -176,7 +203,9 @@ entry: ; Make sure the argument is legalized (can't movzx reg, 0). ; CHECK: movzx {{.*}},{{[^0]}} ; CHECK: call {{.*}} R_{{.*}} memset - +; ARM32-LABEL: test_memset_const_val +; ARM32: uxtb +; ARM32: bl {{.*}} memset define i32 @test_setjmplongjmp(i32 %iptr_env) { entry: @@ -198,6 +227,9 @@ NonZero: ; CHECKO2REM-LABEL: test_setjmplongjmp ; CHECKO2REM: call {{.*}} R_{{.*}} setjmp ; CHECKO2REM: call {{.*}} R_{{.*}} longjmp +; ARM32-LABEL: test_setjmplongjmp +; ARM32: bl {{.*}} setjmp +; ARM32: bl {{.*}} longjmp define i32 @test_setjmp_unused(i32 %iptr_env, i32 %i_other) { entry: @@ -344,6 +376,8 @@ NonZero: } ; CHECK-LABEL: test_trap ; CHECK: ud2 +; ARM32-LABEL: test_trap +; ARM32: .word 0xe7fedef0 define i32 @test_bswap_16(i32 %x) { entry: @@ -356,6 +390,9 @@ entry: ; Make sure this is the right operand size so that the most significant bit ; to least significant bit rotation happens at the right boundary. ; CHECK: rol {{[abcd]x|si|di|bp|word ptr}},0x8 +; ARM32-LABEL: test_bswap_16 +; ARM32: rev +; ARM32: lsr {{.*}} #16 define i32 @test_bswap_32(i32 %x) { entry: @@ -364,6 +401,8 @@ entry: } ; CHECK-LABEL: test_bswap_32 ; CHECK: bswap e{{.*}} +; ARM32-LABEL: test_bswap_32 +; ARM32: rev define i64 @test_bswap_64(i64 %x) { entry: @@ -373,6 +412,9 @@ entry: ; CHECK-LABEL: test_bswap_64 ; CHECK: bswap e{{.*}} ; CHECK: bswap e{{.*}} +; ARM32-LABEL: test_bswap_64 +; ARM32: rev +; ARM32: rev define i32 @test_ctlz_32(i32 %x) { entry: @@ -387,6 +429,8 @@ entry: ; CHECK: mov [[REG_RES:e.*]],0x3f ; CHECK: cmovne [[REG_RES]],[[REG_TMP]] ; CHECK: xor [[REG_RES]],0x1f +; ARM32-LABEL: test_ctlz_32 +; ARM32: clz define i32 @test_ctlz_32_const() { entry: @@ -398,6 +442,8 @@ entry: ; or memory. 
; CHECK-LABEL: test_ctlz_32_const ; CHECK: bsr e{{.*}},{{.*}}e{{.*}} +; ARM32-LABEL: test_ctlz_32_const +; ARM32: clz define i32 @test_ctlz_32_ignored(i32 %x) { entry: @@ -424,6 +470,12 @@ entry: ; CHECK: test [[REG_UPPER:.*]],[[REG_UPPER]] ; CHECK: cmove [[REG_RES2]],[[REG_RES1]] ; CHECK: mov {{.*}},0x0 +; ARM32-LABEL: test_ctlz_64 +; ARM32: clz +; ARM32: cmp {{.*}}, #0 +; ARM32: add {{.*}}, #32 +; ARM32: clzne +; ARM32: mov {{.*}}, #0 define i32 @test_ctlz_64_const(i64 %x) { entry: @@ -434,7 +486,9 @@ entry: ; CHECK-LABEL: test_ctlz_64_const ; CHECK: bsr e{{.*}},{{.*}}e{{.*}} ; CHECK: bsr e{{.*}},{{.*}}e{{.*}} - +; ARM32-LABEL: test_ctlz_64 +; ARM32: clz +; ARM32: clzne define i32 @test_ctlz_64_ignored(i64 %x) { entry: @@ -453,6 +507,9 @@ entry: ; CHECK: bsf [[REG_IF_NOTZERO:e.*]],{{.*}} ; CHECK: mov [[REG_IF_ZERO:e.*]],0x20 ; CHECK: cmovne [[REG_IF_ZERO]],[[REG_IF_NOTZERO]] +; ARM32-LABEL: test_cttz_32 +; ARM32: rbit +; ARM32: clz define i64 @test_cttz_64(i64 %x) { entry: @@ -468,6 +525,14 @@ entry: ; CHECK: test [[REG_LOWER]],[[REG_LOWER]] ; CHECK: cmove [[REG_RES2]],[[REG_RES1]] ; CHECK: mov {{.*}},0x0 +; ARM32-LABEL: test_cttz_64 +; ARM32: rbit +; ARM32: rbit +; ARM32: clz +; ARM32: cmp {{.*}}, #0 +; ARM32: add {{.*}}, #32 +; ARM32: clzne +; ARM32: mov {{.*}}, #0 define i32 @test_popcount_32(i32 %x) { entry: @@ -476,6 +541,8 @@ entry: } ; CHECK-LABEL: test_popcount_32 ; CHECK: call {{.*}} R_{{.*}} __popcountsi2 +; ARM32-LABEL: test_popcount_32 +; ARM32: bl {{.*}} __popcountsi2 define i64 @test_popcount_64(i64 %x) { entry: @@ -487,7 +554,9 @@ entry: ; __popcountdi2 only returns a 32-bit result, so clear the upper bits of ; the return value just in case. ; CHECK: mov {{.*}},0x0 - +; ARM32-LABEL: test_popcount_64 +; ARM32: bl {{.*}} __popcountdi2 +; ARM32: mov {{.*}}, #0 define i32 @test_popcount_64_ret_i32(i64 %x) { entry: @@ -509,6 +578,9 @@ entry: ; CHECK-LABEL: test_stacksave_noalloca ; CHECK: mov {{.*}},esp ; CHECK: mov esp,{{.*}} +; ARM32-LABEL: test_stacksave_noalloca +; ARM32: mov {{.*}}, sp +; ARM32: mov sp, {{.*}} declare i32 @foo(i32 %x) @@ -544,3 +616,8 @@ entry: ; CHECK: mov {{.*}},esp ; CHECK: mov {{.*}},esp ; CHECK: mov esp,{{.*}} +; ARM32-LABEL: test_stacksave_multiple +; ARM32: mov {{.*}}, sp +; ARM32: mov {{.*}}, sp +; ARM32: mov {{.*}}, sp +; ARM32: mov sp, {{.*}} -- 2.11.0
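Appendix-style illustration (not part of the patch): the test_stacksave
crosstest referenced in the commit message exercises llvm.stacksave and
llvm.stackrestore through C99 variable-length arrays. A minimal example of that
pattern, assuming it is compiled as C or with GNU VLA support; the function and
variable names are made up for illustration:

// Each loop iteration allocates a VLA; the frontend brackets it with
// llvm.stacksave/llvm.stackrestore, which this patch lowers to plain moves
// between a scratch register and sp on ARM32.
int sum_last_of_each_row(int n, const int *src) {
  int total = 0;
  for (int i = 1; i <= n; ++i) {
    int row[i];                 // VLA: triggers stacksave/stackrestore
    for (int j = 0; j < i; ++j)
      row[j] = src[j];
    total += row[i - 1];
  }
  return total;
}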