return;
Func->dump("After x86 address mode opt");
+ doLoadOpt();
Func->genCode();
if (Func->hasError())
return;
}
}
+namespace {
+
+// Returns the value of a ConstantInteger32 operand, or MemoryOrderInvalid
+// if the operand is not a ConstantInteger32.
+uint64_t getConstantMemoryOrder(Operand *Opnd) {
+ if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
+ return Integer->getValue();
+ return Intrinsics::MemoryOrderInvalid;
+}
+
+// Determines whether the dest of a Load instruction can be folded
+// into one of the src operands of a 2-operand instruction. This is
+// true as long as the load dest matches exactly one of the binary
+// instruction's src operands. Replaces Src0 or Src1 with LoadSrc if
+// the answer is true.
+bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
+ Operand *&Src0, Operand *&Src1) {
+ if (Src0 == LoadDest && Src1 != LoadDest) {
+ Src0 = LoadSrc;
+ return true;
+ }
+ if (Src0 != LoadDest && Src1 == LoadDest) {
+ Src1 = LoadSrc;
+ return true;
+ }
+ return false;
+}
+
+} // end of anonymous namespace
+
+void TargetX8632::doLoadOpt() {
+ for (CfgNode *Node : Func->getNodes()) {
+ Context.init(Node);
+ while (!Context.atEnd()) {
+ Variable *LoadDest = nullptr;
+ Operand *LoadSrc = nullptr;
+ Inst *CurInst = Context.getCur();
+ Inst *Next = Context.getNextInst();
+ // Determine whether the current instruction is a Load
+ // instruction or equivalent.
+ if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
+ // An InstLoad always qualifies.
+ LoadDest = Load->getDest();
+ const bool DoLegalize = false;
+ LoadSrc = formMemoryOperand(Load->getSourceAddress(),
+ LoadDest->getType(), DoLegalize);
+ } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
+ // An AtomicLoad intrinsic qualifies as long as it has a valid
+ // memory ordering, and can be implemented in a single
+ // instruction (i.e., not i64).
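+ // For AtomicLoad, getArg(0) is the address operand and getArg(1) is the
+ // memory-order operand.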
+ Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
+ if (ID == Intrinsics::AtomicLoad &&
+ Intrin->getDest()->getType() != IceType_i64 &&
+ Intrinsics::isMemoryOrderValid(
+ ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
+ LoadDest = Intrin->getDest();
+ const bool DoLegalize = false;
+ LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
+ DoLegalize);
+ }
+ }
+ // A Load instruction can be folded into the following
+ // instruction only if the following instruction ends the Load's
+ // Dest variable's live range.
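+ // For example:  a = load [mem];  c = b + a  ==>  c = b + [mem]
+ // is legal only if the add ends a's live range.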
+ if (LoadDest && Next && Next->isLastUse(LoadDest)) {
+ assert(LoadSrc);
+ Inst *NewInst = nullptr;
+ if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
+ Operand *Src0 = Arith->getSrc(0);
+ Operand *Src1 = Arith->getSrc(1);
+ if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+ NewInst = InstArithmetic::create(Func, Arith->getOp(),
+ Arith->getDest(), Src0, Src1);
+ }
+ } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
+ Operand *Src0 = Icmp->getSrc(0);
+ Operand *Src1 = Icmp->getSrc(1);
+ if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+ NewInst = InstIcmp::create(Func, Icmp->getCondition(),
+ Icmp->getDest(), Src0, Src1);
+ }
+ } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
+ Operand *Src0 = Fcmp->getSrc(0);
+ Operand *Src1 = Fcmp->getSrc(1);
+ if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+ NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
+ Fcmp->getDest(), Src0, Src1);
+ }
+ } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
+ Operand *Src0 = Select->getTrueOperand();
+ Operand *Src1 = Select->getFalseOperand();
+ if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
+ NewInst = InstSelect::create(Func, Select->getDest(),
+ Select->getCondition(), Src0, Src1);
+ }
+ } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
+ // The load dest can be folded into a Cast instruction whenever the
+ // cast's source operand is the load dest.
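+ // For example:  a = load [mem];  b = cast(a)  ==>  b = cast([mem]).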
+ Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
+ if (Src0 == LoadDest) {
+ NewInst = InstCast::create(Func, Cast->getCastKind(),
+ Cast->getDest(), LoadSrc);
+ }
+ }
+ if (NewInst) {
+ CurInst->setDeleted();
+ Next->setDeleted();
+ Context.insert(NewInst);
+ // Update NewInst->LiveRangesEnded so that target lowering
+ // may benefit. Also update NewInst->HasSideEffects.
+ NewInst->spliceLivenessInfo(Next, CurInst);
+ }
+ }
+ Context.advanceCur();
+ Context.advanceNext();
+ }
+ }
+ Func->dump("After load optimization");
+}
+
bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
if (InstX8632Br *Br = llvm::dyn_cast<InstX8632Br>(I)) {
return Br->optimizeBranch(NextNode);
// that stack slot.
std::function<bool(Variable *)> TargetVarHook =
[&VariablesLinkedToSpillSlots](Variable *Var) {
- if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
- assert(Var->getWeight().isZero());
- if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
- VariablesLinkedToSpillSlots.push_back(Var);
- return true;
- }
- }
- return false;
- };
+ if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
+ assert(Var->getWeight().isZero());
+ if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
+ VariablesLinkedToSpillSlots.push_back(Var);
+ return true;
+ }
+ }
+ return false;
+ };
// Compute the list of spilled variables and bounds for GlobalsSize, etc.
getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
Variable *Dest = Inst->getDest();
Operand *Src0 = legalize(Inst->getSrc(0));
Operand *Src1 = legalize(Inst->getSrc(1));
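+ // For a commutative operation, prefer to place a non-Variable operand
+ // (e.g. a constant, or a memory operand produced by load folding) in the
+ // Src1 position, since the two-operand lowering below typically keeps
+ // Src0 in the register that becomes the destination.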
+ if (Inst->isCommutative()) {
+ if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
+ std::swap(Src0, Src1);
+ }
if (Dest->getType() == IceType_i64) {
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
}
}
-namespace {
-
-// Converts a ConstantInteger32 operand into its constant value, or
-// MemoryOrderInvalid if the operand is not a ConstantInteger32.
-uint64_t getConstantMemoryOrder(Operand *Opnd) {
- if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
- return Integer->getValue();
- return Intrinsics::MemoryOrderInvalid;
-}
-
-} // end of anonymous namespace
-
void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
case Intrinsics::AtomicCmpxchg: {
Func->setError("Unexpected memory ordering for AtomicRMW");
return;
}
- lowerAtomicRMW(Instr->getDest(),
- static_cast<uint32_t>(llvm::cast<ConstantInteger32>(
- Instr->getArg(0))->getValue()),
- Instr->getArg(1), Instr->getArg(2));
+ lowerAtomicRMW(
+ Instr->getDest(),
+ static_cast<uint32_t>(
+ llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
+ Instr->getArg(1), Instr->getArg(2));
return;
case Intrinsics::AtomicStore: {
if (!Intrinsics::isMemoryOrderValid(
// OperandX8632Mem operand. Note that the address mode
// optimization already creates an OperandX8632Mem operand, so it
// doesn't need another level of transformation.
- Type Ty = Load->getDest()->getType();
- Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
-
- // Fuse this load with a subsequent Arithmetic instruction in the
- // following situations:
- // a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
- // a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
- //
- // Fuse this load with a subsequent Cast instruction:
- // a=[mem]; b=cast(a) ==> b=cast([mem]) if last use of a
- //
- // TODO: Clean up and test thoroughly.
- // (E.g., if there is an mfence-all make sure the load ends up on the
- // same side of the fence).
- //
- // TODO: Why limit to Arithmetic instructions? This could probably be
- // applied to most any instruction type. Look at all source operands
- // in the following instruction, and if there is one instance of the
- // load instruction's dest variable, and that instruction ends that
- // variable's live range, then make the substitution. Deal with
- // commutativity optimization in the arithmetic instruction lowering.
- //
- // TODO(stichnot): Do load fusing as a separate pass. Run it before
- // the bool folding pass. Modify Ice::Inst to allow src operands to
- // be replaced, including updating Inst::LiveRangesEnded, to avoid
- // having to manually mostly clone each instruction type.
- Inst *NextInst = Context.getNextInst();
Variable *DestLoad = Load->getDest();
- if (NextInst && NextInst->isLastUse(DestLoad)) {
- if (auto *Arith = llvm::dyn_cast<InstArithmetic>(NextInst)) {
- InstArithmetic *NewArith = nullptr;
- Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
- Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
- if (Src1Arith == DestLoad && DestLoad != Src0Arith) {
- NewArith = InstArithmetic::create(
- Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(0), Src0);
- } else if (Src0Arith == DestLoad && Arith->isCommutative() &&
- DestLoad != Src1Arith) {
- NewArith = InstArithmetic::create(
- Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(1), Src0);
- }
- if (NewArith) {
- Arith->setDeleted();
- Context.advanceNext();
- lowerArithmetic(NewArith);
- return;
- }
- } else if (auto *Cast = llvm::dyn_cast<InstCast>(NextInst)) {
- Variable *Src0Cast = llvm::dyn_cast<Variable>(Cast->getSrc(0));
- if (Src0Cast == DestLoad) {
- InstCast *NewCast =
- InstCast::create(Func, Cast->getCastKind(), Cast->getDest(), Src0);
- Cast->setDeleted();
- Context.advanceNext();
- lowerCast(NewCast);
- return;
- }
- }
- }
-
+ Type Ty = DestLoad->getType();
+ Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
lowerAssign(Assign);
}
return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
}
-OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty) {
+OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty,
+ bool DoLegalize) {
OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand);
// It may be the case that address mode optimization already creates
// an OperandX8632Mem, so in that case it wouldn't need another level
}
Mem = OperandX8632Mem::create(Func, Ty, Base, Offset);
}
- return llvm::cast<OperandX8632Mem>(legalize(Mem));
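+ // When DoLegalize is false (e.g. when called from doLoadOpt()), defer
+ // legalization of the memory operand to the lowering of the instruction
+ // that consumes it.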
+ return llvm::cast<OperandX8632Mem>(DoLegalize ? legalize(Mem) : Mem);
}
Variable *TargetX8632::makeReg(Type Type, int32_t RegNum) {
@g32_c = internal global [4 x i8] zeroinitializer, align 4
@g32_d = internal global [4 x i8] zeroinitializer, align 4
-define i32 @test_fused_load_add_a() {
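+; Note: sub (a non-commutative operation) is used with the loaded value as
+; its second operand, so the load can be folded directly into that operand,
+; as verified below.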
+define i32 @test_fused_load_sub_a() {
entry:
%p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32*
%p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
- %l_a2 = add i32 %l_a, 1
+ %l_a2 = sub i32 1, %l_a
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32, i32* %p_b, align 1
- %l_b2 = add i32 %l_b, 1
+ %l_b2 = sub i32 1, %l_b
store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32*
%l_c = load i32, i32* %p_c, align 1
- %l_c2 = add i32 %l_c, 1
+ %l_c2 = sub i32 1, %l_c
call void @llvm.nacl.atomic.fence.all()
store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2
}
-; CHECK-LABEL: test_fused_load_add_a
+; CHECK-LABEL: test_fused_load_sub_a
; alloca store
; CHECK: mov {{.*}},esp
; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence)
-; The load + add are optimized into one everywhere.
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
+; The load and sub are optimized into one instruction everywhere.
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
; CHECK: mov DWORD PTR
; CHECK: mfence
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b
; CHECK: mov DWORD PTR
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: mfence
; CHECK: mov DWORD PTR
; Test with the fence moved up a bit.
-define i32 @test_fused_load_add_b() {
+define i32 @test_fused_load_sub_b() {
entry:
%p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32*
%p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
- %l_a2 = add i32 %l_a, 1
+ %l_a2 = sub i32 1, %l_a
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32, i32* %p_b, align 1
- %l_b2 = add i32 %l_b, 1
+ %l_b2 = sub i32 1, %l_b
store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32*
call void @llvm.nacl.atomic.fence.all()
%l_c = load i32, i32* %p_c, align 1
- %l_c2 = add i32 %l_c, 1
+ %l_c2 = sub i32 1, %l_c
store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2
}
-; CHECK-LABEL: test_fused_load_add_b
+; CHECK-LABEL: test_fused_load_sub_b
; alloca store
; CHECK: mov {{.*}},esp
; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence)
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
; CHECK: mov DWORD PTR
; CHECK: mfence
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b
; CHECK: mov DWORD PTR
; CHECK: mfence
-; Load + add can still be optimized into one instruction
+; Load + sub can still be optimized into one instruction
; because it is not separated by a fence.
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: mov DWORD PTR
-; Test with the fence splitting a load/add.
-define i32 @test_fused_load_add_c() {
+; Test with the fence splitting a load/sub.
+define i32 @test_fused_load_sub_c() {
entry:
%p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32*
%p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
- %l_a2 = add i32 %l_a, 1
+ %l_a2 = sub i32 1, %l_a
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32, i32* %p_b, align 1
call void @llvm.nacl.atomic.fence.all()
- %l_b2 = add i32 %l_b, 1
+ %l_b2 = sub i32 1, %l_b
store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32*
%l_c = load i32, i32* %p_c, align 1
- %l_c2 = add i32 %l_c, 1
+ %l_c2 = sub i32 1, %l_c
store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2
}
-; CHECK-LABEL: test_fused_load_add_c
+; CHECK-LABEL: test_fused_load_sub_c
; alloca store
; CHECK: mov {{.*}},esp
; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence)
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
; CHECK: mov DWORD PTR
; CHECK: mfence
-; This load + add are no longer optimized into one,
+; This load + sub pair is no longer optimized into one instruction,
; though perhaps it should be legal as long as
; the load stays on the same side of the fence.
; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_b
; CHECK: mfence
-; CHECK: add {{.*}},0x1
+; CHECK: mov {{.*}},0x1
+; CHECK: sub
; CHECK: mov DWORD PTR
-; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c
+; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: mov DWORD PTR