From c77f817f923b2fcbf043da76e1a3066f7729aef2 Mon Sep 17 00:00:00 2001
From: Jim Stichnoth
Date: Sun, 31 May 2015 23:34:44 -0700
Subject: [PATCH] Subzero: Fold the load instruction into the next cast
 instruction.

This is similar to the way a load instruction may be folded into the
next arithmetic instruction.

Usually the effect is to improve a sequence like:
  mov ax, WORD PTR [mem]
  movsx eax, ax
into this:
  movsx eax, WORD PTR [mem]
without actually improving register allocation, though other kinds of
casts may have different improvements.

Existing tests needed to be fixed when they "inadvertently" did a cast
to i32 return type and triggered the optimization when it wasn't
wanted.  These were fixed by inserting a "dummy" instruction between
the load and the cast.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095
R=jvoung@chromium.org

Review URL: https://codereview.chromium.org/1152783006
---
 src/IceTargetLoweringX8632.cpp                     | 108 ++++++---
 tests_lit/llvm2ice_tests/8bit.pnacl.ll             |   6 +-
 tests_lit/llvm2ice_tests/load_cast.ll              | 266 +++++++++++++++++++++
 tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll |   6 +-
 4 files changed, 344 insertions(+), 42 deletions(-)
 create mode 100644 tests_lit/llvm2ice_tests/load_cast.ll

diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index 7edb2c36f..a1ba9d600 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -883,7 +883,7 @@ void TargetX8632::addProlog(CfgNode *Node) {
     // that stack slot.
     if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
       assert(Var->getWeight().isZero());
-      if (!SpillVar->getLinkedTo()->hasReg()) {
+      if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
         VariablesLinkedToSpillSlots.push_back(Var);
         continue;
       }
@@ -1160,8 +1160,9 @@ void TargetX8632::split64(Variable *Var) {
 }
 
 Operand *TargetX8632::loOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64);
-  if (Operand->getType() != IceType_i64)
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
     return Operand;
   if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
     split64(Var);
@@ -1180,8 +1181,9 @@
 }
 
 Operand *TargetX8632::hiOperand(Operand *Operand) {
-  assert(Operand->getType() == IceType_i64);
-  if (Operand->getType() != IceType_i64)
+  assert(Operand->getType() == IceType_i64 ||
+         Operand->getType() == IceType_f64);
+  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
     return Operand;
   if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
     split64(Var);
@@ -2463,20 +2465,25 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
       //   a_lo.i32 = t_lo.i32
       //   t_hi.i32 = hi(s.f64)
       //   a_hi.i32 = t_hi.i32
-      SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(IceType_f64);
-      SpillVar->setLinkedTo(llvm::dyn_cast<Variable>(Src0RM));
-      Variable *Spill = SpillVar;
-      Spill->setWeight(RegWeight::Zero);
-      _movq(Spill, Src0RM);
+      Operand *SpillLo, *SpillHi;
+      if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
+        SpillVariable *SpillVar =
+            Func->makeVariable<SpillVariable>(IceType_f64);
+        SpillVar->setLinkedTo(Src0Var);
+        Variable *Spill = SpillVar;
+        Spill->setWeight(RegWeight::Zero);
+        _movq(Spill, Src0RM);
+        SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
+        SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
+      } else {
+        SpillLo = loOperand(Src0RM);
+        SpillHi = hiOperand(Src0RM);
+      }
       Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
       Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
       Variable *T_Lo = makeReg(IceType_i32);
       Variable *T_Hi = makeReg(IceType_i32);
-      VariableSplit *SpillLo =
-          VariableSplit::create(Func, Spill, VariableSplit::Low);
-      VariableSplit *SpillHi =
-          VariableSplit::create(Func, Spill, VariableSplit::High);
       _mov(T_Lo, SpillLo);
       _mov(DestLo, T_Lo);
@@ -2486,6 +2493,12 @@ void TargetX8632::lowerCast(const InstCast *Inst) {
     case IceType_f64: {
       Src0 = legalize(Src0);
       assert(Src0->getType() == IceType_i64);
+      if (llvm::isa<OperandX8632Mem>(Src0)) {
+        Variable *T = Func->makeVariable(Dest->getType());
+        _movq(T, Src0);
+        _movq(Dest, T);
+        break;
+      }
       // a.f64 = bitcast b.i64 ==>
       //   t_lo.i32 = b_lo.i32
       //   FakeDef(s.f64)
@@ -3955,20 +3968,23 @@ void computeAddressOpt(Cfg *Func, const Inst *Instr, Variable *&Base,
 
 } // anonymous namespace
 
-void TargetX8632::lowerLoad(const InstLoad *Inst) {
+void TargetX8632::lowerLoad(const InstLoad *Load) {
   // A Load instruction can be treated the same as an Assign
   // instruction, after the source operand is transformed into an
   // OperandX8632Mem operand.  Note that the address mode
   // optimization already creates an OperandX8632Mem operand, so it
   // doesn't need another level of transformation.
-  Type Ty = Inst->getDest()->getType();
-  Operand *Src0 = FormMemoryOperand(Inst->getSourceAddress(), Ty);
+  Type Ty = Load->getDest()->getType();
+  Operand *Src0 = FormMemoryOperand(Load->getSourceAddress(), Ty);
 
   // Fuse this load with a subsequent Arithmetic instruction in the
   // following situations:
   //   a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
   //   a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
   //
+  // Fuse this load with a subsequent Cast instruction:
+  //   a=[mem]; b=cast(a) ==> b=cast([mem]) if last use of a
+  //
   // TODO: Clean up and test thoroughly.
   // (E.g., if there is an mfence-all make sure the load ends up on the
   // same side of the fence).
@@ -3979,30 +3995,46 @@ void TargetX8632::lowerLoad(const InstLoad *Inst) {
   // load instruction's dest variable, and that instruction ends that
   // variable's live range, then make the substitution.  Deal with
   // commutativity optimization in the arithmetic instruction lowering.
-  InstArithmetic *NewArith = nullptr;
-  if (InstArithmetic *Arith =
-          llvm::dyn_cast_or_null<InstArithmetic>(Context.getNextInst())) {
-    Variable *DestLoad = Inst->getDest();
-    Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
-    Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
-    if (Src1Arith == DestLoad && Arith->isLastUse(Src1Arith) &&
-        DestLoad != Src0Arith) {
-      NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
-                                        Arith->getSrc(0), Src0);
-    } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
-               Arith->isLastUse(Src0Arith) && DestLoad != Src1Arith) {
-      NewArith = InstArithmetic::create(Func, Arith->getOp(), Arith->getDest(),
-                                        Arith->getSrc(1), Src0);
-    }
-    if (NewArith) {
-      Arith->setDeleted();
-      Context.advanceNext();
-      lowerArithmetic(NewArith);
-      return;
+  //
+  // TODO(stichnot): Do load fusing as a separate pass.  Run it before
+  // the bool folding pass.  Modify Ice::Inst to allow src operands to
+  // be replaced, including updating Inst::LiveRangesEnded, to avoid
+  // having to manually mostly clone each instruction type.
+  Inst *NextInst = Context.getNextInst();
+  Variable *DestLoad = Load->getDest();
+  if (NextInst && NextInst->isLastUse(DestLoad)) {
+    if (auto *Arith = llvm::dyn_cast<InstArithmetic>(NextInst)) {
+      InstArithmetic *NewArith = nullptr;
+      Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
+      Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
+      if (Src1Arith == DestLoad && DestLoad != Src0Arith) {
+        NewArith = InstArithmetic::create(
+            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(0), Src0);
+      } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
+                 DestLoad != Src1Arith) {
+        NewArith = InstArithmetic::create(
+            Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(1), Src0);
+      }
+      if (NewArith) {
+        Arith->setDeleted();
+        Context.advanceNext();
+        lowerArithmetic(NewArith);
+        return;
+      }
+    } else if (auto *Cast = llvm::dyn_cast<InstCast>(NextInst)) {
+      Variable *Src0Cast = llvm::dyn_cast<Variable>(Cast->getSrc(0));
+      if (Src0Cast == DestLoad) {
+        InstCast *NewCast =
+            InstCast::create(Func, Cast->getCastKind(), Cast->getDest(), Src0);
+        Cast->setDeleted();
+        Context.advanceNext();
+        lowerCast(NewCast);
+        return;
+      }
     }
   }
-  InstAssign *Assign = InstAssign::create(Func, Inst->getDest(), Src0);
+  InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
   lowerAssign(Assign);
 }
diff --git a/tests_lit/llvm2ice_tests/8bit.pnacl.ll b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
index 4987a1d5c..81db96d7f 100644
--- a/tests_lit/llvm2ice_tests/8bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/8bit.pnacl.ll
@@ -335,7 +335,8 @@ define i32 @load_i8(i32 %addr_arg) {
 entry:
   %addr = inttoptr i32 %addr_arg to i8*
   %ret = load i8* %addr, align 1
-  %ret_ext = zext i8 %ret to i32
+  %ret2 = sub i8 %ret, 0
+  %ret_ext = zext i8 %ret2 to i32
   ret i32 %ret_ext
 }
 ; CHECK-LABEL: load_i8
@@ -345,7 +346,8 @@ define i32 @load_i8_global(i32 %addr_arg) {
 entry:
   %addr = bitcast [1 x i8]* @global8 to i8*
   %ret = load i8* %addr, align 1
-  %ret_ext = zext i8 %ret to i32
+  %ret2 = sub i8 %ret, 0
+  %ret_ext = zext i8 %ret2 to i32
   ret i32 %ret_ext
 }
 ; CHECK-LABEL: load_i8_global
diff --git a/tests_lit/llvm2ice_tests/load_cast.ll b/tests_lit/llvm2ice_tests/load_cast.ll
new file mode 100644
index 000000000..5395d04ff
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/load_cast.ll
@@ -0,0 +1,266 @@
+; Tests desired and undesired folding of load instructions into cast
+; instructions.  The folding is only done when liveness analysis is performed,
+; so only O2 is tested.
+
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
+
+; Not testing trunc, or 32-bit bitcast, because the lowered code uses pretty
+; much the same mov instructions regardless of whether folding is done.
+
+define internal i32 @zext_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %result = zext i8 %load to i32
+  ret i32 %result
+}
+; CHECK-LABEL: zext_fold
+; CHECK: movzx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal i32 @zext_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %tmp1 = zext i8 %load to i32
+  %tmp2 = zext i8 %load to i32
+  %result = add i32 %tmp1, %tmp2
+  ret i32 %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: zext_nofold
+; CHECK-NOT: movzx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal i32 @sext_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %result = sext i8 %load to i32
+  ret i32 %result
+}
+; CHECK-LABEL: sext_fold
+; CHECK: movsx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal i32 @sext_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i8*
+  %load = load i8* %addr, align 1
+  %tmp1 = sext i8 %load to i32
+  %tmp2 = sext i8 %load to i32
+  %result = add i32 %tmp1, %tmp2
+  ret i32 %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: sext_nofold
+; CHECK-NOT: movsx {{.*}},BYTE PTR [{{.*}}+0xc8]
+
+define internal float @fptrunc_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = fptrunc double %load to float
+  ret float %result
+}
+; CHECK-LABEL: fptrunc_fold
+; CHECK: cvtsd2ss {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal float @fptrunc_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = fptrunc double %load to float
+  %tmp2 = fptrunc double %load to float
+  %result = fadd float %tmp1, %tmp2
+  ret float %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fptrunc_nofold
+; CHECK-NOT: cvtsd2ss {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal double @fpext_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to float*
+  %load = load float* %addr, align 4
+  %result = fpext float %load to double
+  ret double %result
+}
+; CHECK-LABEL: fpext_fold
+; CHECK: cvtss2sd {{.*}},DWORD PTR [{{.*}}+0xc8]
+
+define internal double @fpext_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to float*
+  %load = load float* %addr, align 4
+  %tmp1 = fpext float %load to double
+  %tmp2 = fpext float %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fpext_nofold
+; CHECK-NOT: cvtss2sd {{.*}},DWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptoui_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = fptoui double %load to i16
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; CHECK-LABEL: fptoui_fold
+; CHECK: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptoui_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = fptoui double %load to i16
+  %tmp2 = fptoui double %load to i16
+  %result = add i16 %tmp1, %tmp2
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fptoui_nofold
+; CHECK-NOT: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptosi_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = fptosi double %load to i16
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; CHECK-LABEL: fptosi_fold
+; CHECK: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i32 @fptosi_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = fptosi double %load to i16
+  %tmp2 = fptosi double %load to i16
+  %result = add i16 %tmp1, %tmp2
+  %result2 = zext i16 %result to i32
+  ret i32 %result2
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: fptosi_nofold
+; CHECK-NOT: cvttsd2si {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal double @uitofp_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %result = uitofp i16 %load to double
+  ret double %result
+}
+; CHECK-LABEL: uitofp_fold
+; CHECK: movzx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @uitofp_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %tmp1 = uitofp i16 %load to double
+  %tmp2 = uitofp i16 %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: uitofp_nofold
+; CHECK-NOT: movzx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @sitofp_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %result = sitofp i16 %load to double
+  ret double %result
+}
+; CHECK-LABEL: sitofp_fold
+; CHECK: movsx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @sitofp_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i16*
+  %load = load i16* %addr, align 1
+  %tmp1 = sitofp i16 %load to double
+  %tmp2 = sitofp i16 %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: sitofp_nofold
+; CHECK-NOT: movsx {{.*}},WORD PTR [{{.*}}+0xc8]
+
+define internal double @bitcast_i64_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i64*
+  %load = load i64* %addr, align 1
+  %result = bitcast i64 %load to double
+  ret double %result
+}
+; CHECK-LABEL: bitcast_i64_fold
+; CHECK: movq {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal double @bitcast_i64_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to i64*
+  %load = load i64* %addr, align 1
+  %tmp1 = bitcast i64 %load to double
+  %tmp2 = bitcast i64 %load to double
+  %result = fadd double %tmp1, %tmp2
+  ret double %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: bitcast_i64_nofold
+; CHECK-NOT: movq {{.*}},QWORD PTR [{{.*}}+0xc8]
+
+define internal i64 @bitcast_double_fold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %result = bitcast double %load to i64
+  ret i64 %result
+}
+; CHECK-LABEL: bitcast_double_fold
+; CHECK-NOT: QWORD PTR
+; CHECK: mov {{.*}},DWORD PTR [{{.*}}+0xc8]
+; CHECK: mov {{.*}},DWORD PTR [{{.*}}+0xcc]
+; CHECK-NOT: QWORD PTR
+
+define internal i64 @bitcast_double_nofold(i32 %arg) {
+entry:
+  %ptr = add i32 %arg, 200
+  %addr = inttoptr i32 %ptr to double*
+  %load = load double* %addr, align 8
+  %tmp1 = bitcast double %load to i64
+  %tmp2 = bitcast double %load to i64
+  %result = add i64 %tmp1, %tmp2
+  ret i64 %result
+}
+; Test that load folding does not happen.
+; CHECK-LABEL: bitcast_double_nofold
+; CHECK: QWORD PTR
+; CHECK: QWORD PTR
diff --git a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
index a0b8b6c5e..1d24497d1 100644
--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -48,7 +48,8 @@ entry:
   %ptr = inttoptr i32 %iptr to i8*
   ; parameter value "6" is for the sequential consistency memory order.
   %i = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 6)
-  %r = zext i8 %i to i32
+  %i2 = sub i8 %i, 0
+  %r = zext i8 %i2 to i32
   ret i32 %r
 }
 ; CHECK-LABEL: test_atomic_load_8
@@ -59,7 +60,8 @@ define i32 @test_atomic_load_16(i32 %iptr) {
 entry:
   %ptr = inttoptr i32 %iptr to i16*
   %i = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 6)
-  %r = zext i16 %i to i32
+  %i2 = sub i16 %i, 0
+  %r = zext i16 %i2 to i32
   ret i32 %r
 }
 ; CHECK-LABEL: test_atomic_load_16
-- 
2.11.0