//===- subzero/src/IceTargetLoweringX8632.cpp - x86-32 lowering -----------===//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the TargetLoweringX8632 class, which
// consists almost entirely of the lowering sequence for each
// high-level instruction.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/MathExtras.h"

#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInstX8632.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IceRegistersX8632.h"
#include "IceTargetLoweringX8632.def"
#include "IceTargetLoweringX8632.h"
namespace {

// The following table summarizes the logic for lowering the fcmp
// instruction. There is one table entry for each of the 16 conditions.
//
// The first four columns describe the case when the operands are
// floating point scalar values. A comment in lowerFcmp() describes the
// lowering template. In the most general case, there is a compare
// followed by two conditional branches, because some fcmp conditions
// don't map to a single x86 conditional branch. However, in many cases
// it is possible to swap the operands in the comparison and have a
// single conditional branch. Since it's quite tedious to validate the
// table by hand, good execution tests are helpful.
//
// The last two columns describe the case when the operands are vectors
// of floating point values. For most fcmp conditions, there is a clear
// mapping to a single x86 cmpps instruction variant. Some fcmp
// conditions require special code to handle and these are marked in the
// table with a Cmpps_Invalid predicate.
const struct TableFcmp_ {
  uint32_t Default;
  bool SwapScalarOperands;
  CondX86::BrCond C1, C2;
  bool SwapVectorOperands;
  CondX86::CmppsCond Predicate;
} TableFcmp[] = {
#define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
  { dflt, swapS, CondX86::C1, CondX86::C2, swapV, CondX86::pred }              \
  ,
    FCMPX8632_TABLE
#undef X
};
const size_t TableFcmpSize = llvm::array_lengthof(TableFcmp);
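
// A minimal sketch of how a lowering routine might consult this table for
// the vector case (illustrative only; lowerFcmp() below is the
// authoritative consumer, and the scalar case additionally uses the
// C1/C2 branch conditions):
//
//   const TableFcmp_ &Entry = TableFcmp[Fcmp->getCondition()];
//   if (Entry.SwapVectorOperands)
//     std::swap(Src0, Src1);
//   if (Entry.Predicate != CondX86::Cmpps_Invalid)
//     _cmpps(T, Src1, Entry.Predicate); // single cmpps variant
//   else
//     /* special-case code for this condition */;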

// The following table summarizes the logic for lowering the icmp instruction
// for i32 and narrower types. Each icmp condition has a clear mapping to an
// x86 conditional branch instruction.
const struct TableIcmp32_ {
  CondX86::BrCond Mapping;
} TableIcmp32[] = {
#define X(val, C_32, C1_64, C2_64, C3_64)                                      \
  { CondX86::C_32 }                                                            \
  ,
    ICMPX8632_TABLE
#undef X
};
const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32);

// The following table summarizes the logic for lowering the icmp instruction
// for the i64 type. For Eq and Ne, two separate 32-bit comparisons and
// conditional branches are needed. For the other conditions, three separate
// conditional branches are needed.
const struct TableIcmp64_ {
  CondX86::BrCond C1, C2, C3;
} TableIcmp64[] = {
#define X(val, C_32, C1_64, C2_64, C3_64)                                      \
  { CondX86::C1_64, CondX86::C2_64, CondX86::C3_64 }                           \
  ,
    ICMPX8632_TABLE
#undef X
};
const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);
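
// For reference, a sketch of the three-branch lowering that consumes the
// C1/C2/C3 conditions for a 64-bit "br (icmp b, c), TargetTrue,
// TargetFalse" (illustrative; the exact sequence is emitted by the icmp
// lowering code):
//
//   cmp b.hi, c.hi
//   br C1, TargetTrue    ; unequal high halves decide the result
//   br C2, TargetFalse
//   cmp b.lo, c.lo       ; high halves equal: compare the low halves
//   br C3, TargetTrue
//   br TargetFalse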

CondX86::BrCond getIcmp32Mapping(InstIcmp::ICond Cond) {
  size_t Index = static_cast<size_t>(Cond);
  assert(Index < TableIcmp32Size);
  return TableIcmp32[Index].Mapping;
}
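
// Example usage (the mapping itself comes from ICMPX8632_TABLE; e.g. a
// signed less-than compare maps to a single "less" branch condition):
//
//   _cmp(Src0, Src1);
//   _br(getIcmp32Mapping(Icmp->getCondition()), TargetTrue, TargetFalse);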

const struct TableTypeX8632Attributes_ {
  Type InVectorElementType;
} TableTypeX8632Attributes[] = {
#define X(tag, elementty, cvt, sdss, pack, width, fld)                         \
  { elementty }                                                                \
  ,
    ICETYPEX8632_TABLE
#undef X
};
const size_t TableTypeX8632AttributesSize =
    llvm::array_lengthof(TableTypeX8632Attributes);

// Return the type which the elements of the vector have in the X86
// representation of the vector.
Type getInVectorElementType(Type Ty) {
  assert(isVectorType(Ty));
  size_t Index = static_cast<size_t>(Ty);
  (void)Index;
  assert(Index < TableTypeX8632AttributesSize);
  return TableTypeX8632Attributes[Ty].InVectorElementType;
}

// The maximum number of arguments to pass in XMM registers
const uint32_t X86_MAX_XMM_ARGS = 4;
// The number of bits in a byte
const uint32_t X86_CHAR_BIT = 8;
// Stack alignment
const uint32_t X86_STACK_ALIGNMENT_BYTES = 16;
// Size of the return address on the stack
const uint32_t X86_RET_IP_SIZE_BYTES = 4;
// The number of different NOP instructions
const uint32_t X86_NUM_NOP_VARIANTS = 5;

// Value is in bytes. Returns Value rounded up to the next highest multiple
// of the stack alignment.
uint32_t applyStackAlignment(uint32_t Value) {
  return Utils::applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
}
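
// Worked example (assuming the usual (Value + Align - 1) & ~(Align - 1)
// round-up computation in Utils::applyAlignment):
//
//   applyStackAlignment(0)  == 0
//   applyStackAlignment(20) == 32
//   applyStackAlignment(32) == 32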

// In some cases, there are x-macros tables for both high-level and
// low-level instructions/operands that use the same enum key value.
// The tables are kept separate to maintain a proper separation
// between abstraction layers. There is a risk that the tables could
// get out of sync if enum values are reordered or if entries are
// added or deleted. The following dummy namespaces use
// static_asserts to ensure everything is kept in sync.

// Validate the enum values in FCMPX8632_TABLE.
namespace dummy1 {
// Define a temporary set of enum values based on low-level table
// entries.
enum _tmp_enum {
#define X(val, dflt, swapS, C1, C2, swapV, pred) _tmp_##val,
  FCMPX8632_TABLE
#undef X
      _num
};
// Define a set of constants based on high-level table entries.
#define X(tag, str) static const int _table1_##tag = InstFcmp::tag;
ICEINSTFCMP_TABLE;
#undef X
// Define a set of constants based on low-level table entries, and
// ensure the table entry keys are consistent.
#define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
  static const int _table2_##val = _tmp_##val;                                 \
  static_assert(                                                               \
      _table1_##val == _table2_##val,                                          \
      "Inconsistency between FCMPX8632_TABLE and ICEINSTFCMP_TABLE");
FCMPX8632_TABLE;
#undef X
// Repeat the static asserts with respect to the high-level table
// entries in case the high-level table has extra entries.
#define X(tag, str)                                                            \
  static_assert(                                                               \
      _table1_##tag == _table2_##tag,                                          \
      "Inconsistency between FCMPX8632_TABLE and ICEINSTFCMP_TABLE");
ICEINSTFCMP_TABLE;
#undef X
} // end of namespace dummy1

// Validate the enum values in ICMPX8632_TABLE.
namespace dummy2 {
// Define a temporary set of enum values based on low-level table
// entries.
enum _tmp_enum {
#define X(val, C_32, C1_64, C2_64, C3_64) _tmp_##val,
  ICMPX8632_TABLE
#undef X
      _num
};
// Define a set of constants based on high-level table entries.
#define X(tag, str) static const int _table1_##tag = InstIcmp::tag;
ICEINSTICMP_TABLE;
#undef X
// Define a set of constants based on low-level table entries, and
// ensure the table entry keys are consistent.
#define X(val, C_32, C1_64, C2_64, C3_64)                                      \
  static const int _table2_##val = _tmp_##val;                                 \
  static_assert(                                                               \
      _table1_##val == _table2_##val,                                          \
      "Inconsistency between ICMPX8632_TABLE and ICEINSTICMP_TABLE");
ICMPX8632_TABLE;
#undef X
// Repeat the static asserts with respect to the high-level table
// entries in case the high-level table has extra entries.
#define X(tag, str)                                                            \
  static_assert(                                                               \
      _table1_##tag == _table2_##tag,                                          \
      "Inconsistency between ICMPX8632_TABLE and ICEINSTICMP_TABLE");
ICEINSTICMP_TABLE;
#undef X
} // end of namespace dummy2

// Validate the enum values in ICETYPEX8632_TABLE.
namespace dummy3 {
// Define a temporary set of enum values based on low-level table
// entries.
enum _tmp_enum {
#define X(tag, elementty, cvt, sdss, pack, width, fld) _tmp_##tag,
  ICETYPEX8632_TABLE
#undef X
      _num
};
// Define a set of constants based on high-level table entries.
#define X(tag, size, align, elts, elty, str)                                   \
  static const int _table1_##tag = tag;
ICETYPE_TABLE;
#undef X
// Define a set of constants based on low-level table entries, and
// ensure the table entry keys are consistent.
#define X(tag, elementty, cvt, sdss, pack, width, fld)                         \
  static const int _table2_##tag = _tmp_##tag;                                 \
  static_assert(_table1_##tag == _table2_##tag,                                \
                "Inconsistency between ICETYPEX8632_TABLE and ICETYPE_TABLE");
ICETYPEX8632_TABLE;
#undef X
// Repeat the static asserts with respect to the high-level table
// entries in case the high-level table has extra entries.
#define X(tag, size, align, elts, elty, str)                                   \
  static_assert(_table1_##tag == _table2_##tag,                                \
                "Inconsistency between ICETYPEX8632_TABLE and ICETYPE_TABLE");
ICETYPE_TABLE;
#undef X
} // end of namespace dummy3
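
// The pattern above, reduced to its essentials (hypothetical FOO_TABLE
// x-macro shared between a high-level enum HighLevel and this file):
//
//   #define FOO_TABLE X(Alpha) X(Beta)
//   enum _tmp_enum {
//   #define X(tag) _tmp_##tag,
//     FOO_TABLE
//   #undef X
//   };
//   #define X(tag) \
//     static_assert(HighLevel::tag == _tmp_##tag, "tables out of sync");
//   FOO_TABLE
//   #undef X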

} // end of anonymous namespace

BoolFoldingEntry::BoolFoldingEntry(Inst *I)
    : Instr(I), IsComplex(BoolFolding::hasComplexLowering(I)), IsLiveOut(true),
      NumUses(0) {}

BoolFolding::BoolFoldingProducerKind
BoolFolding::getProducerKind(const Inst *Instr) {
  if (llvm::isa<InstIcmp>(Instr)) {
    if (Instr->getSrc(0)->getType() != IceType_i64)
      return PK_Icmp32;
    return PK_None; // TODO(stichnot): actually PK_Icmp64;
  }
  return PK_None; // TODO(stichnot): remove this
  if (llvm::isa<InstFcmp>(Instr))
    return PK_Fcmp;
  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return PK_None;
    case InstCast::Trunc:
      return PK_Trunc;
    }
  }
  return PK_None;
}

BoolFolding::BoolFoldingConsumerKind
BoolFolding::getConsumerKind(const Inst *Instr) {
  if (llvm::isa<InstBr>(Instr))
    return CK_Br;
  if (llvm::isa<InstSelect>(Instr))
    return CK_Select;
  return CK_None; // TODO(stichnot): remove this
  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return CK_None;
    case InstCast::Sext:
      return CK_Sext;
    case InstCast::Zext:
      return CK_Zext;
    }
  }
  return CK_None;
}

// Returns true if the producing instruction has a "complex" lowering
// sequence. This generally means that its lowering sequence requires
// more than one conditional branch, namely 64-bit integer compares
// and some floating-point compares. When this is true, and there is
// more than one consumer, we prefer to disable the folding
// optimization, since disabling it minimizes the total number of
// branches.
bool BoolFolding::hasComplexLowering(const Inst *Instr) {
  switch (getProducerKind(Instr)) {
  default:
    return false;
  case PK_Icmp64:
    return true;
  case PK_Fcmp:
    return TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
           CondX86::Br_None;
  }
}

void BoolFolding::init(CfgNode *Node) {
  Producers.clear();
  for (Inst &Instr : Node->getInsts()) {
    // Check whether Instr is a valid producer.
    Variable *Var = Instr.getDest();
    if (!Instr.isDeleted() // only consider non-deleted instructions
        && Var             // only instructions with an actual dest var
        && Var->getType() == IceType_i1          // only bool-type dest vars
        && getProducerKind(&Instr) != PK_None) { // white-listed instructions
      Producers[Var->getIndex()] = BoolFoldingEntry(&Instr);
    }
    // Check each src variable against the map.
    for (SizeT I = 0; I < Instr.getSrcSize(); ++I) {
      Operand *Src = Instr.getSrc(I);
      SizeT NumVars = Src->getNumVars();
      for (SizeT J = 0; J < NumVars; ++J) {
        const Variable *Var = Src->getVar(J);
        SizeT VarNum = Var->getIndex();
        if (containsValid(VarNum)) {
          if (I != 0 // All valid consumers use Var as the first source operand
              || getConsumerKind(&Instr) == CK_None // must be white-listed
              || (Producers[VarNum].IsComplex && // complex can't be multi-use
                  Producers[VarNum].NumUses > 0)) {
            setInvalid(VarNum);
            continue;
          }
          ++Producers[VarNum].NumUses;
          if (Instr.isLastUse(Var)) {
            Producers[VarNum].IsLiveOut = false;
          }
        }
      }
    }
  }
  for (auto &I : Producers) {
    // Ignore entries previously marked invalid.
    if (I.second.Instr == nullptr)
      continue;
    // Disable the producer if its dest may be live beyond this block.
    if (I.second.IsLiveOut) {
      setInvalid(I.first);
      continue;
    }
    // Mark as "dead" rather than outright deleting. This is so that
    // other peephole style optimizations during or before lowering
    // have access to this instruction in undeleted form. See for
    // example tryOptimizedCmpxchgCmpBr().
    I.second.Instr->setDead();
  }
}

const Inst *BoolFolding::getProducerFor(const Operand *Opnd) const {
  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
  if (Var == nullptr)
    return nullptr;
  SizeT VarNum = Var->getIndex();
  auto Element = Producers.find(VarNum);
  if (Element == Producers.end())
    return nullptr;
  return Element->second.Instr;
}

void BoolFolding::dump(const Cfg *Func) const {
  if (!ALLOW_DUMP || !Func->isVerbose(IceV_Folding))
    return;
  OstreamLocker L(Func->getContext());
  Ostream &Str = Func->getContext()->getStrDump();
  for (auto &I : Producers) {
    if (I.second.Instr == nullptr)
      continue;
    Str << "Found foldable producer:\n  ";
    I.second.Instr->dump(Func);
    Str << "\n";
  }
}

void TargetX8632::initNodeForLowering(CfgNode *Node) {
  FoldingInfo.init(Node);
  FoldingInfo.dump(Func);
}

TargetX8632::TargetX8632(Cfg *Func)
    : TargetLowering(Func),
      InstructionSet(static_cast<X86InstructionSet>(
          Func->getContext()->getFlags().getTargetInstructionSet() -
          TargetInstructionSet::X86InstructionSet_Begin)),
      IsEbpBasedFrame(false), NeedsStackAlignment(false),
      SpillAreaSizeBytes(0) {
  static_assert((X86InstructionSet::End - X86InstructionSet::Begin) ==
                    (TargetInstructionSet::X86InstructionSet_End -
                     TargetInstructionSet::X86InstructionSet_Begin),
                "X86InstructionSet range different from TargetInstructionSet");
  // TODO: Don't initialize IntegerRegisters and friends every time.
  // Instead, initialize in some sort of static initializer for the
  // class.
  llvm::SmallBitVector IntegerRegisters(RegX8632::Reg_NUM);
  llvm::SmallBitVector IntegerRegistersI8(RegX8632::Reg_NUM);
  llvm::SmallBitVector FloatRegisters(RegX8632::Reg_NUM);
  llvm::SmallBitVector VectorRegisters(RegX8632::Reg_NUM);
  llvm::SmallBitVector InvalidRegisters(RegX8632::Reg_NUM);
  ScratchRegs.resize(RegX8632::Reg_NUM);
#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
          frameptr, isI8, isInt, isFP)                                         \
  IntegerRegisters[RegX8632::val] = isInt;                                     \
  IntegerRegistersI8[RegX8632::val] = isI8;                                    \
  FloatRegisters[RegX8632::val] = isFP;                                        \
  VectorRegisters[RegX8632::val] = isFP;                                       \
  ScratchRegs[RegX8632::val] = scratch;
  REGX8632_TABLE;
#undef X
  TypeToRegisterSet[IceType_void] = InvalidRegisters;
  TypeToRegisterSet[IceType_i1] = IntegerRegistersI8;
  TypeToRegisterSet[IceType_i8] = IntegerRegistersI8;
  TypeToRegisterSet[IceType_i16] = IntegerRegisters;
  TypeToRegisterSet[IceType_i32] = IntegerRegisters;
  TypeToRegisterSet[IceType_i64] = IntegerRegisters;
  TypeToRegisterSet[IceType_f32] = FloatRegisters;
  TypeToRegisterSet[IceType_f64] = FloatRegisters;
  TypeToRegisterSet[IceType_v4i1] = VectorRegisters;
  TypeToRegisterSet[IceType_v8i1] = VectorRegisters;
  TypeToRegisterSet[IceType_v16i1] = VectorRegisters;
  TypeToRegisterSet[IceType_v16i8] = VectorRegisters;
  TypeToRegisterSet[IceType_v8i16] = VectorRegisters;
  TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
  TypeToRegisterSet[IceType_v4f32] = VectorRegisters;
}

void TargetX8632::translateO2() {
  TimerMarker T(TimerStack::TT_O2, Func);

  if (!Ctx->getFlags().getPhiEdgeSplit()) {
    // Lower Phi instructions.
    Func->placePhiLoads();
    if (Func->hasError())
      return;
    Func->placePhiStores();
    if (Func->hasError())
      return;
    Func->deletePhis();
    if (Func->hasError())
      return;
    Func->dump("After Phi lowering");
  }

  // Address mode optimization.
  Func->getVMetadata()->init(VMK_SingleDefs);
  Func->doAddressOpt();

  // Argument lowering
  Func->doArgLowering();

  // Target lowering. This requires liveness analysis for some parts
  // of the lowering decisions, such as compare/branch fusing. If
  // non-lightweight liveness analysis is used, the instructions need
  // to be renumbered first. TODO: This renumbering should only be
  // necessary if we're actually calculating live intervals, which we
  // only do for register allocation.
  Func->renumberInstructions();
  if (Func->hasError())
    return;

  // TODO: It should be sufficient to use the fastest liveness
  // calculation, i.e. livenessLightweight(). However, for some
  // reason that slows down the rest of the translation. Investigate.
  Func->liveness(Liveness_Basic);
  if (Func->hasError())
    return;
  Func->dump("After x86 address mode opt");

  doLoadOpt();
  Func->genCode();
  if (Func->hasError())
    return;
  Func->dump("After x86 codegen");

  // Register allocation. This requires instruction renumbering and
  // full liveness analysis.
  Func->renumberInstructions();
  if (Func->hasError())
    return;
  Func->liveness(Liveness_Intervals);
  if (Func->hasError())
    return;
  // Validate the live range computations. The expensive validation
  // call is deliberately only made when assertions are enabled.
  assert(Func->validateLiveness());
  // The post-codegen dump is done here, after liveness analysis and
  // associated cleanup, to make the dump cleaner and more useful.
  Func->dump("After initial x8632 codegen");
  Func->getVMetadata()->init(VMK_All);
  regAlloc(RAK_Global);
  if (Func->hasError())
    return;
  Func->dump("After linear scan regalloc");

  if (Ctx->getFlags().getPhiEdgeSplit()) {
    Func->advancedPhiLowering();
    Func->dump("After advanced Phi lowering");
  }

  // Stack frame mapping.
  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  Func->contractEmptyNodes();
  Func->reorderNodes();

  // Branch optimization. This needs to be done just before code
  // emission. In particular, no transformations that insert or
  // reorder CfgNodes should be done after branch optimization. We go
  // ahead and do it before nop insertion to reduce the amount of work
  // needed for searching for opportunities.
  Func->doBranchOpt();
  Func->dump("After branch optimization");

  // Nop insertion
  if (Ctx->getFlags().shouldDoNopInsertion()) {
    Func->doNopInsertion();
  }
}

void TargetX8632::translateOm1() {
  TimerMarker T(TimerStack::TT_Om1, Func);

  Func->placePhiLoads();
  if (Func->hasError())
    return;
  Func->placePhiStores();
  if (Func->hasError())
    return;
  Func->deletePhis();
  if (Func->hasError())
    return;
  Func->dump("After Phi lowering");

  Func->doArgLowering();

  Func->genCode();
  if (Func->hasError())
    return;
  Func->dump("After initial x8632 codegen");

  regAlloc(RAK_InfOnly);
  if (Func->hasError())
    return;
  Func->dump("After regalloc of infinite-weight variables");

  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  // Nop insertion
  if (Ctx->getFlags().shouldDoNopInsertion()) {
    Func->doNopInsertion();
  }
}

namespace {

// Converts a ConstantInteger32 operand into its constant value, or
// MemoryOrderInvalid if the operand is not a ConstantInteger32.
uint64_t getConstantMemoryOrder(Operand *Opnd) {
  if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
    return Integer->getValue();
  return Intrinsics::MemoryOrderInvalid;
}

// Determines whether the dest of a Load instruction can be folded
// into one of the src operands of a 2-operand instruction. This is
// true as long as the load dest matches exactly one of the binary
// instruction's src operands. Replaces Src0 or Src1 with LoadSrc if
// the answer is true.
bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
                               Operand *&Src0, Operand *&Src1) {
  if (Src0 == LoadDest && Src1 != LoadDest) {
    Src0 = LoadSrc;
    return true;
  }
  if (Src0 != LoadDest && Src1 == LoadDest) {
    Src1 = LoadSrc;
    return true;
  }
  return false;
}

} // end of anonymous namespace

void TargetX8632::doLoadOpt() {
  for (CfgNode *Node : Func->getNodes()) {
    Context.init(Node);
    while (!Context.atEnd()) {
      Variable *LoadDest = nullptr;
      Operand *LoadSrc = nullptr;
      Inst *CurInst = Context.getCur();
      Inst *Next = Context.getNextInst();
      // Determine whether the current instruction is a Load
      // instruction or equivalent.
      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
        // An InstLoad always qualifies.
        LoadDest = Load->getDest();
        const bool DoLegalize = false;
        LoadSrc = formMemoryOperand(Load->getSourceAddress(),
                                    LoadDest->getType(), DoLegalize);
      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
        // An AtomicLoad intrinsic qualifies as long as it has a valid
        // memory ordering, and can be implemented in a single
        // instruction (i.e., not i64).
        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
        if (ID == Intrinsics::AtomicLoad &&
            Intrin->getDest()->getType() != IceType_i64 &&
            Intrinsics::isMemoryOrderValid(
                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
          LoadDest = Intrin->getDest();
          const bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
                                      DoLegalize);
        }
      }
      // A Load instruction can be folded into the following
      // instruction only if the following instruction ends the Load's
      // Dest variable's live range.
      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
        assert(LoadSrc);
        Inst *NewInst = nullptr;
        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
          Operand *Src0 = Arith->getSrc(0);
          Operand *Src1 = Arith->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstArithmetic::create(Func, Arith->getOp(),
                                             Arith->getDest(), Src0, Src1);
          }
        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
          Operand *Src0 = Icmp->getSrc(0);
          Operand *Src1 = Icmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
                                       Icmp->getDest(), Src0, Src1);
          }
        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
          Operand *Src0 = Fcmp->getSrc(0);
          Operand *Src1 = Fcmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
                                       Fcmp->getDest(), Src0, Src1);
          }
        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
          Operand *Src0 = Select->getTrueOperand();
          Operand *Src1 = Select->getFalseOperand();
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstSelect::create(Func, Select->getDest(),
                                         Select->getCondition(), Src0, Src1);
          }
        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
          // The load dest can always be folded into a Cast
          // instruction.
          Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
          if (Src0 == LoadDest) {
            NewInst = InstCast::create(Func, Cast->getCastKind(),
                                       Cast->getDest(), LoadSrc);
          }
        }
        if (NewInst) {
          CurInst->setDeleted();
          Next->setDeleted();
          Context.insert(NewInst);
          // Update NewInst->LiveRangesEnded so that target lowering
          // may benefit. Also update NewInst->HasSideEffects.
          NewInst->spliceLivenessInfo(Next, CurInst);
        }
      }
      Context.advanceCur();
      Context.advanceNext();
    }
  }
  Func->dump("After load optimization");
}
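
// Illustration of the rewrite this pass performs (schematic Ice IR; the
// fold is only legal because the add ends x's live range):
//
//   x = load p                -->   (load deleted)
//   y = add x, 3              -->   y = add [p], 3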

bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
  if (InstX8632Br *Br = llvm::dyn_cast<InstX8632Br>(I)) {
    return Br->optimizeBranch(NextNode);
  }
  return false;
}

IceString TargetX8632::RegNames[] = {
#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
          frameptr, isI8, isInt, isFP)                                         \
  name,
    REGX8632_TABLE
#undef X
};

Variable *TargetX8632::getPhysicalRegister(SizeT RegNum, Type Ty) {
  if (Ty == IceType_void)
    Ty = IceType_i32;
  if (PhysicalRegisters[Ty].empty())
    PhysicalRegisters[Ty].resize(RegX8632::Reg_NUM);
  assert(RegNum < PhysicalRegisters[Ty].size());
  Variable *Reg = PhysicalRegisters[Ty][RegNum];
  if (Reg == nullptr) {
    Reg = Func->makeVariable(Ty);
    Reg->setRegNum(RegNum);
    PhysicalRegisters[Ty][RegNum] = Reg;
    // Specially mark esp as an "argument" so that it is considered
    // live upon function entry.
    if (RegNum == RegX8632::Reg_esp) {
      Func->addImplicitArg(Reg);
      Reg->setIgnoreLiveness();
    }
  }
  return Reg;
}

IceString TargetX8632::getRegName(SizeT RegNum, Type Ty) const {
  assert(RegNum < RegX8632::Reg_NUM);
  static IceString RegNames8[] = {
#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
          frameptr, isI8, isInt, isFP)                                         \
  name8,
      REGX8632_TABLE
#undef X
  };
  static IceString RegNames16[] = {
#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
          frameptr, isI8, isInt, isFP)                                         \
  name16,
      REGX8632_TABLE
#undef X
  };
  switch (Ty) {
  case IceType_i1:
  case IceType_i8:
    return RegNames8[RegNum];
  case IceType_i16:
    return RegNames16[RegNum];
  default:
    return RegNames[RegNum];
  }
}

void TargetX8632::emitVariable(const Variable *Var) const {
  Ostream &Str = Ctx->getStrEmit();
  if (Var->hasReg()) {
    Str << "%" << getRegName(Var->getRegNum(), Var->getType());
    return;
  }
  if (Var->getWeight().isInf())
    llvm_unreachable("Infinite-weight Variable has no register assigned");
  int32_t Offset = Var->getStackOffset();
  if (!hasFramePointer())
    Offset += getStackAdjustment();
  if (Offset)
    Str << Offset;
  const Type FrameSPTy = IceType_i32;
  Str << "(%" << getRegName(getFrameOrStackReg(), FrameSPTy) << ")";
}

X8632::Address TargetX8632::stackVarToAsmOperand(const Variable *Var) const {
  if (Var->hasReg())
    llvm_unreachable("Stack Variable has a register assigned");
  if (Var->getWeight().isInf())
    llvm_unreachable("Infinite-weight Variable has no register assigned");
  int32_t Offset = Var->getStackOffset();
  if (!hasFramePointer())
    Offset += getStackAdjustment();
  return X8632::Address(RegX8632::getEncodedGPR(getFrameOrStackReg()), Offset);
}

void TargetX8632::lowerArguments() {
  VarList &Args = Func->getArgs();
  // The first four arguments of vector type, regardless of their
  // position relative to the other arguments in the argument list, are
  // passed in registers xmm0 - xmm3.
  unsigned NumXmmArgs = 0;

  Context.init(Func->getEntryNode());
  Context.setInsertPoint(Context.getCur());

  for (SizeT I = 0, E = Args.size(); I < E && NumXmmArgs < X86_MAX_XMM_ARGS;
       ++I) {
    Variable *Arg = Args[I];
    Type Ty = Arg->getType();
    if (!isVectorType(Ty))
      continue;
    // Replace Arg in the argument list with the home register. Then
    // generate an instruction in the prolog to copy the home register
    // to the assigned location of Arg.
    int32_t RegNum = RegX8632::Reg_xmm0 + NumXmmArgs;
    ++NumXmmArgs;
    Variable *RegisterArg = Func->makeVariable(Ty);
    if (ALLOW_DUMP)
      RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
    RegisterArg->setRegNum(RegNum);
    RegisterArg->setIsArg();
    Arg->setIsArg(false);

    Args[I] = RegisterArg;
    Context.insert(InstAssign::create(Func, Arg, RegisterArg));
  }
}

// Helper function for addProlog().
//
// This assumes Arg is an argument passed on the stack. This sets the
// frame offset for Arg and updates InArgsSizeBytes according to Arg's
// width. For an I64 arg that has been split into Lo and Hi components,
// it calls itself recursively on the components, taking care to handle
// Lo first because of the little-endian architecture. Lastly, this
// function generates an instruction to copy Arg into its assigned
// register if applicable.
void TargetX8632::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
                                         size_t BasicFrameOffset,
                                         size_t &InArgsSizeBytes) {
  Variable *Lo = Arg->getLo();
  Variable *Hi = Arg->getHi();
  Type Ty = Arg->getType();
  if (Lo && Hi && Ty == IceType_i64) {
    assert(Lo->getType() != IceType_i64); // don't want infinite recursion
    assert(Hi->getType() != IceType_i64); // don't want infinite recursion
    finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
    finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
    return;
  }
  if (isVectorType(Ty)) {
    InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes);
  }
  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
  if (Arg->hasReg()) {
    assert(Ty != IceType_i64);
    OperandX8632Mem *Mem = OperandX8632Mem::create(
        Func, Ty, FramePtr, Ctx->getConstantInt32(Arg->getStackOffset()));
    if (isVectorType(Arg->getType())) {
      _movp(Arg, Mem);
    } else {
      _mov(Arg, Mem);
    }
    // This argument-copying instruction uses an explicit
    // OperandX8632Mem operand instead of a Variable, so its
    // fill-from-stack operation has to be tracked separately for
    // statistics.
    Ctx->statsUpdateFills();
  }
}

Type TargetX8632::stackSlotType() { return IceType_i32; }

void TargetX8632::addProlog(CfgNode *Node) {
  // Stack frame layout:
  //
  // +------------------------+
  // | 1. return address      |
  // +------------------------+
  // | 2. preserved registers |
  // +------------------------+
  // | 3. padding             |
  // +------------------------+
  // | 4. global spill area   |
  // +------------------------+
  // | 5. padding             |
  // +------------------------+
  // | 6. local spill area    |
  // +------------------------+
  // | 7. padding             |
  // +------------------------+
  // | 8. allocas             |
  // +------------------------+
  //
  // The following variables record the size in bytes of the given areas:
  //  * X86_RET_IP_SIZE_BYTES:  area 1
  //  * PreservedRegsSizeBytes: area 2
  //  * SpillAreaPaddingBytes:  area 3
  //  * GlobalsSize:            area 4
  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
  //  * LocalsSpillAreaSize:    area 6
  //  * SpillAreaSizeBytes:     areas 3 - 7
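  //
  // Worked example (hypothetical numbers): with 8 bytes of preserved
  // registers, a 12-byte globals area, and a locals area requiring
  // 16-byte alignment, areas 1-2 occupy offsets 0-11, area 3 pads 4 bytes
  // so the spill areas start 16-byte aligned, areas 4-5 occupy 12 + 4
  // bytes, and area 6 then starts at the next 16-byte boundary.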

  // Determine stack frame offsets for each Variable without a
  // register assignment. This can be done as one variable per stack
  // slot. Or, do coalescing by running the register allocator again
  // with an infinite set of registers (as a side effect, this gives
  // variables a second chance at physical register assignment).
  //
  // A middle ground approach is to leverage sparsity and allocate one
  // block of space on the frame for globals (variables with
  // multi-block lifetime), and one block to share for locals
  // (single-block lifetime).

  Context.init(Node);
  Context.setInsertPoint(Context.getCur());

  llvm::SmallBitVector CalleeSaves =
      getRegisterSet(RegSet_CalleeSave, RegSet_None);
  RegsUsed = llvm::SmallBitVector(CalleeSaves.size());
  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
  size_t GlobalsSize = 0;
  // If there is a separate locals area, this represents that area.
  // Otherwise it counts any variable not counted by GlobalsSize.
  SpillAreaSizeBytes = 0;
  // If there is a separate locals area, this specifies the alignment
  // for it.
  uint32_t LocalsSlotsAlignmentBytes = 0;
  // The entire spill locations area gets aligned to largest natural
  // alignment of the variables that have a spill slot.
  uint32_t SpillAreaAlignmentBytes = 0;
  // A spill slot linked to a variable with a stack slot should reuse
  // that stack slot.
  std::function<bool(Variable *)> TargetVarHook =
      [&VariablesLinkedToSpillSlots](Variable *Var) {
    if (SpillVariable *SpillVar = llvm::dyn_cast<SpillVariable>(Var)) {
      assert(Var->getWeight().isZero());
      if (SpillVar->getLinkedTo() && !SpillVar->getLinkedTo()->hasReg()) {
        VariablesLinkedToSpillSlots.push_back(Var);
        return true;
      }
    }
    return false;
  };

  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
                        &LocalsSlotsAlignmentBytes, TargetVarHook);
  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
  SpillAreaSizeBytes += GlobalsSize;

  // Add push instructions for preserved registers.
  uint32_t NumCallee = 0;
  size_t PreservedRegsSizeBytes = 0;
  for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
    if (CalleeSaves[i] && RegsUsed[i]) {
      ++NumCallee;
      PreservedRegsSizeBytes += 4;
      _push(getPhysicalRegister(i));
    }
  }
  Ctx->statsUpdateRegistersSaved(NumCallee);

  // Generate "push ebp; mov ebp, esp"
  if (IsEbpBasedFrame) {
    assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
               .count() == 0);
    PreservedRegsSizeBytes += 4;
    Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
    Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
    _push(ebp);
    _mov(ebp, esp);
    // Keep ebp live for late-stage liveness analysis
    // (e.g. asm-verbose mode).
    Context.insert(InstFakeUse::create(Func, ebp));
  }

  // Align the variables area. SpillAreaPaddingBytes is the size of
  // the region after the preserved registers and before the spill areas.
  // LocalsSlotsPaddingBytes is the amount of padding between the globals
  // and locals area if they are separate.
  assert(SpillAreaAlignmentBytes <= X86_STACK_ALIGNMENT_BYTES);
  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
  uint32_t SpillAreaPaddingBytes = 0;
  uint32_t LocalsSlotsPaddingBytes = 0;
  alignStackSpillAreas(X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
                       SpillAreaAlignmentBytes, GlobalsSize,
                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
                       &LocalsSlotsPaddingBytes);
  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
  uint32_t GlobalsAndSubsequentPaddingSize =
      GlobalsSize + LocalsSlotsPaddingBytes;

  // Align esp if necessary.
  if (NeedsStackAlignment) {
    uint32_t StackOffset = X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
    uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);
    SpillAreaSizeBytes = StackSize - StackOffset;
  }

  // Generate "sub esp, SpillAreaSizeBytes"
  if (SpillAreaSizeBytes)
    _sub(getPhysicalRegister(RegX8632::Reg_esp),
         Ctx->getConstantInt32(SpillAreaSizeBytes));
  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

  resetStackAdjustment();

  // Fill in stack offsets for stack args, and copy args into registers
  // for those that were register-allocated. Args are pushed right to
  // left, so Arg[0] is closest to the stack/frame pointer.
  Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
  size_t BasicFrameOffset = PreservedRegsSizeBytes + X86_RET_IP_SIZE_BYTES;
  if (!IsEbpBasedFrame)
    BasicFrameOffset += SpillAreaSizeBytes;

  const VarList &Args = Func->getArgs();
  size_t InArgsSizeBytes = 0;
  unsigned NumXmmArgs = 0;
  for (Variable *Arg : Args) {
    // Skip arguments passed in registers.
    if (isVectorType(Arg->getType()) && NumXmmArgs < X86_MAX_XMM_ARGS) {
      ++NumXmmArgs;
      continue;
    }
    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
  }

  // Fill in stack offsets for locals.
  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
                      IsEbpBasedFrame);
  // Assign stack offsets to variables that have been linked to spilled
  // variables.
  for (Variable *Var : VariablesLinkedToSpillSlots) {
    Variable *Linked = (llvm::cast<SpillVariable>(Var))->getLinkedTo();
    Var->setStackOffset(Linked->getStackOffset());
  }
  this->HasComputedFrame = true;

  if (ALLOW_DUMP && Func->isVerbose(IceV_Frame)) {
    OstreamLocker L(Func->getContext());
    Ostream &Str = Func->getContext()->getStrDump();

    Str << "Stack layout:\n";
    uint32_t EspAdjustmentPaddingSize =
        SpillAreaSizeBytes - LocalsSpillAreaSize -
        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;
    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
        << " return address = " << X86_RET_IP_SIZE_BYTES << " bytes\n"
        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
        << " globals spill area = " << GlobalsSize << " bytes\n"
        << " globals-locals spill areas intermediate padding = "
        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
        << " esp alignment padding = " << EspAdjustmentPaddingSize
        << " bytes\n";

    Str << "Stack details:\n"
        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
        << " bytes\n"
        << " is ebp based = " << IsEbpBasedFrame << "\n";
  }
}

void TargetX8632::addEpilog(CfgNode *Node) {
  InstList &Insts = Node->getInsts();
  InstList::reverse_iterator RI, E;
  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
    if (llvm::isa<InstX8632Ret>(*RI))
      break;
  }
  if (RI == E)
    return;

  // Convert the reverse_iterator position into its corresponding
  // (forward) iterator position.
  InstList::iterator InsertPoint = RI.base();
  --InsertPoint;
  Context.init(Node);
  Context.setInsertPoint(InsertPoint);

  Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
  if (IsEbpBasedFrame) {
    Variable *ebp = getPhysicalRegister(RegX8632::Reg_ebp);
    // For late-stage liveness analysis (e.g. asm-verbose mode),
    // adding a fake use of esp before the assignment of esp=ebp keeps
    // previous esp adjustments from being dead-code eliminated.
    Context.insert(InstFakeUse::create(Func, esp));
    _mov(esp, ebp);
    _pop(ebp);
  } else {
    // add esp, SpillAreaSizeBytes
    if (SpillAreaSizeBytes)
      _add(esp, Ctx->getConstantInt32(SpillAreaSizeBytes));
  }

  // Add pop instructions for preserved registers.
  llvm::SmallBitVector CalleeSaves =
      getRegisterSet(RegSet_CalleeSave, RegSet_None);
  for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
    SizeT j = CalleeSaves.size() - i - 1;
    if (j == RegX8632::Reg_ebp && IsEbpBasedFrame)
      continue;
    if (CalleeSaves[j] && RegsUsed[j]) {
      _pop(getPhysicalRegister(j));
    }
  }

  if (!Ctx->getFlags().getUseSandboxing())
    return;
  // Change the original ret instruction into a sandboxed return sequence.
  //   t:ecx = pop
  //   and t, ~(BundleSize - 1)
  //   jmp *t
  //   FakeUse <original_ret_operand>
  const SizeT BundleSize = 1
      << Func->getAssembler<>()->getBundleAlignLog2Bytes();
  Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
  _pop(T_ecx);
  _and(T_ecx, Ctx->getConstantInt32(~(BundleSize - 1)));
  _jmp(T_ecx);

  if (RI->getSrcSize()) {
    Variable *RetValue = llvm::cast<Variable>(RI->getSrc(0));
    Context.insert(InstFakeUse::create(Func, RetValue));
  }
  RI->setDeleted();
}

void TargetX8632::split64(Variable *Var) {
  switch (Var->getType()) {
  default:
    return;
  case IceType_i64:
  // TODO: Only consider F64 if we need to push each half when
  // passing as an argument to a function call. Note that each half
  // is still typed as I32.
  case IceType_f64:
    break;
  }
  Variable *Lo = Var->getLo();
  Variable *Hi = Var->getHi();
  if (Lo) {
    assert(Hi);
    return;
  }
  assert(Hi == nullptr);
  Lo = Func->makeVariable(IceType_i32);
  Hi = Func->makeVariable(IceType_i32);
  if (ALLOW_DUMP) {
    Lo->setName(Func, Var->getName(Func) + "__lo");
    Hi->setName(Func, Var->getName(Func) + "__hi");
  }
  Var->setLoHi(Lo, Hi);
  if (Var->getIsArg()) {
    Lo->setIsArg();
    Hi->setIsArg();
  }
}

Operand *TargetX8632::loOperand(Operand *Operand) {
  assert(Operand->getType() == IceType_i64 ||
         Operand->getType() == IceType_f64);
  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
    return Operand;
  if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
    split64(Var);
    return Var->getLo();
  }
  if (ConstantInteger64 *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
    return Ctx->getConstantInt32(static_cast<uint32_t>(Const->getValue()));
  }
  if (OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand)) {
    return OperandX8632Mem::create(Func, IceType_i32, Mem->getBase(),
                                   Mem->getOffset(), Mem->getIndex(),
                                   Mem->getShift(), Mem->getSegmentRegister());
  }
  llvm_unreachable("Unsupported operand type");
  return nullptr;
}

Operand *TargetX8632::hiOperand(Operand *Operand) {
  assert(Operand->getType() == IceType_i64 ||
         Operand->getType() == IceType_f64);
  if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
    return Operand;
  if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
    split64(Var);
    return Var->getHi();
  }
  if (ConstantInteger64 *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
    return Ctx->getConstantInt32(
        static_cast<uint32_t>(Const->getValue() >> 32));
  }
  if (OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand)) {
    Constant *Offset = Mem->getOffset();
    if (Offset == nullptr) {
      Offset = Ctx->getConstantInt32(4);
    } else if (ConstantInteger32 *IntOffset =
                   llvm::dyn_cast<ConstantInteger32>(Offset)) {
      Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
    } else if (ConstantRelocatable *SymOffset =
                   llvm::dyn_cast<ConstantRelocatable>(Offset)) {
      assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
      Offset =
          Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName(),
                              SymOffset->getSuppressMangling());
    }
    return OperandX8632Mem::create(Func, IceType_i32, Mem->getBase(), Offset,
                                   Mem->getIndex(), Mem->getShift(),
                                   Mem->getSegmentRegister());
  }
  llvm_unreachable("Unsupported operand type");
  return nullptr;
}
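
// Example: for the i64 constant 0x1122334455667788, loOperand() yields the
// i32 constant 0x55667788 and hiOperand() yields 0x11223344; for an i64
// memory operand, hiOperand() is the same address with the offset bumped
// by 4 (little-endian layout).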

llvm::SmallBitVector TargetX8632::getRegisterSet(RegSetMask Include,
                                                 RegSetMask Exclude) const {
  llvm::SmallBitVector Registers(RegX8632::Reg_NUM);

#define X(val, encode, name, name16, name8, scratch, preserved, stackptr,      \
          frameptr, isI8, isInt, isFP)                                         \
  if (scratch && (Include & RegSet_CallerSave))                                \
    Registers[RegX8632::val] = true;                                           \
  if (preserved && (Include & RegSet_CalleeSave))                              \
    Registers[RegX8632::val] = true;                                           \
  if (stackptr && (Include & RegSet_StackPointer))                             \
    Registers[RegX8632::val] = true;                                           \
  if (frameptr && (Include & RegSet_FramePointer))                             \
    Registers[RegX8632::val] = true;                                           \
  if (scratch && (Exclude & RegSet_CallerSave))                                \
    Registers[RegX8632::val] = false;                                          \
  if (preserved && (Exclude & RegSet_CalleeSave))                              \
    Registers[RegX8632::val] = false;                                          \
  if (stackptr && (Exclude & RegSet_StackPointer))                             \
    Registers[RegX8632::val] = false;                                          \
  if (frameptr && (Exclude & RegSet_FramePointer))                             \
    Registers[RegX8632::val] = false;

  REGX8632_TABLE

#undef X

  return Registers;
}

void TargetX8632::lowerAlloca(const InstAlloca *Inst) {
  IsEbpBasedFrame = true;
  // Conservatively require the stack to be aligned. Some stack
  // adjustment operations implemented below assume that the stack is
  // aligned before the alloca. All the alloca code ensures that the
  // stack alignment is preserved after the alloca. The stack alignment
  // restriction can be relaxed in some cases.
  NeedsStackAlignment = true;

  // TODO(stichnot): minimize the number of adjustments of esp, etc.
  Variable *esp = getPhysicalRegister(RegX8632::Reg_esp);
  Operand *TotalSize = legalize(Inst->getSizeInBytes());
  Variable *Dest = Inst->getDest();
  uint32_t AlignmentParam = Inst->getAlignInBytes();
  // For default align=0, set it to the real value 1, to avoid any
  // bit-manipulation problems below.
  AlignmentParam = std::max(AlignmentParam, 1u);

  // LLVM enforces power of 2 alignment.
  assert(llvm::isPowerOf2_32(AlignmentParam));
  assert(llvm::isPowerOf2_32(X86_STACK_ALIGNMENT_BYTES));

  uint32_t Alignment = std::max(AlignmentParam, X86_STACK_ALIGNMENT_BYTES);
  if (Alignment > X86_STACK_ALIGNMENT_BYTES) {
    _and(esp, Ctx->getConstantInt32(-Alignment));
  }
  if (const auto *ConstantTotalSize =
          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
    uint32_t Value = ConstantTotalSize->getValue();
    Value = Utils::applyAlignment(Value, Alignment);
    _sub(esp, Ctx->getConstantInt32(Value));
  } else {
    // Non-constant sizes need to be adjusted to the next highest
    // multiple of the required alignment at runtime.
    Variable *T = makeReg(IceType_i32);
    _mov(T, TotalSize);
    _add(T, Ctx->getConstantInt32(Alignment - 1));
    _and(T, Ctx->getConstantInt32(-Alignment));
    _sub(esp, T);
  }
  _mov(Dest, esp);
}

// Strength-reduce scalar integer multiplication by a constant (for
// i32 or narrower) for certain constants. The lea instruction can be
// used to multiply by 3, 5, or 9, and the shl instruction can be used
// to multiply by powers of 2. These can be combined such that
// e.g. multiplying by 100 can be done as 2 lea-based multiplies by 5,
// combined with left-shifting by 2 (see the worked example after this
// function).
bool TargetX8632::optimizeScalarMul(Variable *Dest, Operand *Src0,
                                    int32_t Src1) {
  // Disable this optimization for Om1 and O0, just to keep things
  // simple there.
  if (Ctx->getFlags().getOptLevel() < Opt_1)
    return false;
  Type Ty = Dest->getType();
  Variable *T = nullptr;
  if (Src1 == -1) {
    _mov(T, Src0);
    _neg(T);
    _mov(Dest, T);
    return true;
  }
  if (Src1 == 0) {
    _mov(Dest, Ctx->getConstantZero(Ty));
    return true;
  }
  if (Src1 == 1) {
    _mov(T, Src0);
    _mov(Dest, T);
    return true;
  }
  // Don't bother with the edge case where Src1 == MININT.
  if (Src1 == -Src1)
    return false;
  const bool Src1IsNegative = Src1 < 0;
  if (Src1IsNegative)
    Src1 = -Src1;
  uint32_t Count9 = 0;
  uint32_t Count5 = 0;
  uint32_t Count3 = 0;
  uint32_t Count2 = 0;
  uint32_t CountOps = 0;
  while (Src1 > 1) {
    if (Src1 % 9 == 0) {
      ++CountOps;
      ++Count9;
      Src1 /= 9;
    } else if (Src1 % 5 == 0) {
      ++CountOps;
      ++Count5;
      Src1 /= 5;
    } else if (Src1 % 3 == 0) {
      ++CountOps;
      ++Count3;
      Src1 /= 3;
    } else if (Src1 % 2 == 0) {
      if (Count2 == 0)
        ++CountOps;
      ++Count2;
      Src1 /= 2;
    } else {
      return false;
    }
  }
  // Lea optimization only works for i16 and i32 types, not i8.
  if (Ty != IceType_i16 && Ty != IceType_i32 && (Count3 || Count5 || Count9))
    return false;
  // Limit the number of lea/shl operations for a single multiply, to
  // a somewhat arbitrary choice of 3.
  const uint32_t MaxOpsForOptimizedMul = 3;
  if (CountOps > MaxOpsForOptimizedMul)
    return false;
  _mov(T, Src0);
  Constant *Zero = Ctx->getConstantZero(IceType_i32);
  for (uint32_t i = 0; i < Count9; ++i) {
    const uint16_t Shift = 3; // log2(9-1)
    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
    _set_dest_nonkillable();
  }
  for (uint32_t i = 0; i < Count5; ++i) {
    const uint16_t Shift = 2; // log2(5-1)
    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
    _set_dest_nonkillable();
  }
  for (uint32_t i = 0; i < Count3; ++i) {
    const uint16_t Shift = 1; // log2(3-1)
    _lea(T, OperandX8632Mem::create(Func, IceType_void, T, Zero, T, Shift));
    _set_dest_nonkillable();
  }
  if (Count2) {
    _shl(T, Ctx->getConstantInt(Ty, Count2));
  }
  if (Src1IsNegative)
    _neg(T);
  _mov(Dest, T);
  return true;
}
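
// Worked example for Src1 == 100 (decomposition 100 = 5 * 5 * 2^2, so
// Count5 == 2, Count2 == 2, CountOps == 3):
//
//   mov T, Src0
//   lea T, [T + 4*T]   ; T = 5*Src0
//   lea T, [T + 4*T]   ; T = 25*Src0
//   shl T, 2           ; T = 100*Src0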

void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
  Variable *Dest = Inst->getDest();
  Operand *Src0 = legalize(Inst->getSrc(0));
  Operand *Src1 = legalize(Inst->getSrc(1));
  if (Inst->isCommutative()) {
    if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
      std::swap(Src0, Src1);
    if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1))
      std::swap(Src0, Src1);
  }
  if (Dest->getType() == IceType_i64) {
    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
    Operand *Src0Lo = loOperand(Src0);
    Operand *Src0Hi = hiOperand(Src0);
    Operand *Src1Lo = loOperand(Src1);
    Operand *Src1Hi = hiOperand(Src1);
    Variable *T_Lo = nullptr, *T_Hi = nullptr;
    switch (Inst->getOp()) {
    case InstArithmetic::_num:
      llvm_unreachable("Unknown arithmetic operator");
      break;
    case InstArithmetic::Add:
      _mov(T_Lo, Src0Lo);
      _add(T_Lo, Src1Lo);
      _mov(DestLo, T_Lo);
      _mov(T_Hi, Src0Hi);
      _adc(T_Hi, Src1Hi);
      _mov(DestHi, T_Hi);
      break;
    case InstArithmetic::And:
      _mov(T_Lo, Src0Lo);
      _and(T_Lo, Src1Lo);
      _mov(DestLo, T_Lo);
      _mov(T_Hi, Src0Hi);
      _and(T_Hi, Src1Hi);
      _mov(DestHi, T_Hi);
      break;
    case InstArithmetic::Or:
      _mov(T_Lo, Src0Lo);
      _or(T_Lo, Src1Lo);
      _mov(DestLo, T_Lo);
      _mov(T_Hi, Src0Hi);
      _or(T_Hi, Src1Hi);
      _mov(DestHi, T_Hi);
      break;
    case InstArithmetic::Xor:
      _mov(T_Lo, Src0Lo);
      _xor(T_Lo, Src1Lo);
      _mov(DestLo, T_Lo);
      _mov(T_Hi, Src0Hi);
      _xor(T_Hi, Src1Hi);
      _mov(DestHi, T_Hi);
      break;
    case InstArithmetic::Sub:
      _mov(T_Lo, Src0Lo);
      _sub(T_Lo, Src1Lo);
      _mov(DestLo, T_Lo);
      _mov(T_Hi, Src0Hi);
      _sbb(T_Hi, Src1Hi);
      _mov(DestHi, T_Hi);
      break;
    case InstArithmetic::Mul: {
      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
      Variable *T_4Lo = makeReg(IceType_i32, RegX8632::Reg_eax);
      Variable *T_4Hi = makeReg(IceType_i32, RegX8632::Reg_edx);
      // gcc does the following:
      // a=b*c ==>
      //   t1 = b.hi; t1 *=(imul) c.lo
      //   t2 = c.hi; t2 *=(imul) b.lo
      //   t3:eax = b.lo
      //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
      //   a.lo = t4.lo
      //   t4.hi += t1
      //   t4.hi += t2
      //   a.hi = t4.hi
      // The mul instruction cannot take an immediate operand.
      Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
      _mov(T_1, Src0Hi);
      _imul(T_1, Src1Lo);
      _mov(T_2, Src1Hi);
      _imul(T_2, Src0Lo);
      _mov(T_3, Src0Lo, RegX8632::Reg_eax);
      _mul(T_4Lo, T_3, Src1Lo);
      // The mul instruction produces two dest variables, edx:eax. We
      // create a fake definition of edx to account for this.
      Context.insert(InstFakeDef::create(Func, T_4Hi, T_4Lo));
      _mov(DestLo, T_4Lo);
      _add(T_4Hi, T_1);
      _add(T_4Hi, T_2);
      _mov(DestHi, T_4Hi);
    } break;
    case InstArithmetic::Shl: {
      // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
      // gcc does the following:
      // a=b<<c ==>
      //   t1:ecx = c.lo & 0xff
      //   t2 = b.lo
      //   t3 = b.hi
      //   t3 = shld t3, t2, t1
      //   t2 = shl t2, t1
      //   test t1, 0x20
      //   je L1
      //   use(t3)
      //   t3 = t2
      //   t2 = 0
      // L1:
      //   a.lo = t2
      //   a.hi = t3
      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
      Constant *BitTest = Ctx->getConstantInt32(0x20);
      Constant *Zero = Ctx->getConstantZero(IceType_i32);
      InstX8632Label *Label = InstX8632Label::create(Func, this);
      _mov(T_1, Src1Lo, RegX8632::Reg_ecx);
      _mov(T_2, Src0Lo);
      _mov(T_3, Src0Hi);
      _shld(T_3, T_2, T_1);
      _shl(T_2, T_1);
      _test(T_1, BitTest);
      _br(CondX86::Br_e, Label);
      // T_2 and T_3 are being assigned again because of the
      // intra-block control flow, so we need the _mov_nonkillable
      // variant to avoid liveness problems.
      _mov_nonkillable(T_3, T_2);
      _mov_nonkillable(T_2, Zero);
      Context.insert(Label);
      _mov(DestLo, T_2);
      _mov(DestHi, T_3);
    } break;
    case InstArithmetic::Lshr: {
      // a=b>>c (unsigned) ==>
      //   t1:ecx = c.lo & 0xff
      //   t2 = b.lo
      //   t3 = b.hi
      //   t2 = shrd t2, t3, t1
      //   t3 = shr t3, t1
      //   test t1, 0x20
      //   je L1
      //   use(t2)
      //   t2 = t3
      //   t3 = 0
      // L1:
      //   a.lo = t2
      //   a.hi = t3
      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
      Constant *BitTest = Ctx->getConstantInt32(0x20);
      Constant *Zero = Ctx->getConstantZero(IceType_i32);
      InstX8632Label *Label = InstX8632Label::create(Func, this);
      _mov(T_1, Src1Lo, RegX8632::Reg_ecx);
      _mov(T_2, Src0Lo);
      _mov(T_3, Src0Hi);
      _shrd(T_2, T_3, T_1);
      _shr(T_3, T_1);
      _test(T_1, BitTest);
      _br(CondX86::Br_e, Label);
      // T_2 and T_3 are being assigned again because of the
      // intra-block control flow, so we need the _mov_nonkillable
      // variant to avoid liveness problems.
      _mov_nonkillable(T_2, T_3);
      _mov_nonkillable(T_3, Zero);
      Context.insert(Label);
      _mov(DestLo, T_2);
      _mov(DestHi, T_3);
    } break;
    case InstArithmetic::Ashr: {
      // a=b>>c (signed) ==>
      //   t1:ecx = c.lo & 0xff
      //   t2 = b.lo
      //   t3 = b.hi
      //   t2 = shrd t2, t3, t1
      //   t3 = sar t3, t1
      //   test t1, 0x20
      //   je L1
      //   use(t2)
      //   t2 = t3
      //   t3 = sar t3, 0x1f
      // L1:
      //   a.lo = t2
      //   a.hi = t3
      Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
      Constant *BitTest = Ctx->getConstantInt32(0x20);
      Constant *SignExtend = Ctx->getConstantInt32(0x1f);
      InstX8632Label *Label = InstX8632Label::create(Func, this);
      _mov(T_1, Src1Lo, RegX8632::Reg_ecx);
      _mov(T_2, Src0Lo);
      _mov(T_3, Src0Hi);
      _shrd(T_2, T_3, T_1);
      _sar(T_3, T_1);
      _test(T_1, BitTest);
      _br(CondX86::Br_e, Label);
      // T_2 and T_3 are being assigned again because of the
      // intra-block control flow, so T_2 needs the _mov_nonkillable
      // variant to avoid liveness problems. T_3 doesn't need special
      // treatment because it is reassigned via _sar instead of _mov.
      _mov_nonkillable(T_2, T_3);
      _sar(T_3, SignExtend);
      Context.insert(Label);
      _mov(DestLo, T_2);
      _mov(DestHi, T_3);
    } break;
    case InstArithmetic::Udiv: {
      const SizeT MaxSrcs = 2;
      InstCall *Call = makeHelperCall(H_udiv_i64, Dest, MaxSrcs);
      Call->addArg(Inst->getSrc(0));
      Call->addArg(Inst->getSrc(1));
      lowerCall(Call);
    } break;
    case InstArithmetic::Sdiv: {
      const SizeT MaxSrcs = 2;
      InstCall *Call = makeHelperCall(H_sdiv_i64, Dest, MaxSrcs);
      Call->addArg(Inst->getSrc(0));
      Call->addArg(Inst->getSrc(1));
      lowerCall(Call);
    } break;
    case InstArithmetic::Urem: {
      const SizeT MaxSrcs = 2;
      InstCall *Call = makeHelperCall(H_urem_i64, Dest, MaxSrcs);
      Call->addArg(Inst->getSrc(0));
      Call->addArg(Inst->getSrc(1));
      lowerCall(Call);
    } break;
    case InstArithmetic::Srem: {
      const SizeT MaxSrcs = 2;
      InstCall *Call = makeHelperCall(H_srem_i64, Dest, MaxSrcs);
      Call->addArg(Inst->getSrc(0));
      Call->addArg(Inst->getSrc(1));
      lowerCall(Call);
    } break;
    case InstArithmetic::Fadd:
    case InstArithmetic::Fsub:
    case InstArithmetic::Fmul:
    case InstArithmetic::Fdiv:
    case InstArithmetic::Frem:
      llvm_unreachable("FP instruction with i64 type");
      break;
    }
    return;
  }
  if (isVectorType(Dest->getType())) {
    // TODO: Trap on integer divide and integer modulo by zero.
    // See: https://code.google.com/p/nativeclient/issues/detail?id=3899
    if (llvm::isa<OperandX8632Mem>(Src1))
      Src1 = legalizeToVar(Src1);
    switch (Inst->getOp()) {
    case InstArithmetic::_num:
      llvm_unreachable("Unknown arithmetic operator");
      break;
    case InstArithmetic::Add: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
      _padd(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::And: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
      _pand(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Or: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
      _por(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Xor: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
      _pxor(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Sub: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
      _psub(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Mul: {
      bool TypesAreValidForPmull =
          Dest->getType() == IceType_v4i32 || Dest->getType() == IceType_v8i16;
      bool InstructionSetIsValidForPmull =
          Dest->getType() == IceType_v8i16 || InstructionSet >= SSE4_1;
      if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
        Variable *T = makeReg(Dest->getType());
        _movp(T, Src0);
        _pmull(T, Src1);
        _movp(Dest, T);
      } else if (Dest->getType() == IceType_v4i32) {
        // Lowering sequence:
        // Note: The mask arguments have index 0 on the left.
        //
        // movups  T1, Src0
        // pshufd  T2, Src0, {1,0,3,0}
        // pshufd  T3, Src1, {1,0,3,0}
        // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
        // pmuludq T1, Src1
        // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
        // pmuludq T2, T3
        // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
        // shufps  T1, T2, {0,2,0,2}
        // pshufd  T4, T1, {0,2,1,3}
        // movups  Dest, T4

        // Mask that directs pshufd to create a vector with entries
        // Src[1, 0, 3, 0]
        const unsigned Constant1030 = 0x31;
        Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
        // Mask that directs shufps to create a vector with entries
        // Dest[0, 2], Src[0, 2]
        const unsigned Mask0202 = 0x88;
        // Mask that directs pshufd to create a vector with entries
        // Src[0, 2, 1, 3]
        const unsigned Mask0213 = 0xd8;
        Variable *T1 = makeReg(IceType_v4i32);
        Variable *T2 = makeReg(IceType_v4i32);
        Variable *T3 = makeReg(IceType_v4i32);
        Variable *T4 = makeReg(IceType_v4i32);
        _movp(T1, Src0);
        _pshufd(T2, Src0, Mask1030);
        _pshufd(T3, Src1, Mask1030);
        _pmuludq(T1, Src1);
        _pmuludq(T2, T3);
        _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
        _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
        _movp(Dest, T4);
      } else {
        assert(Dest->getType() == IceType_v16i8);
        scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
      }
    } break;
    case InstArithmetic::Shl:
    case InstArithmetic::Lshr:
    case InstArithmetic::Ashr:
    case InstArithmetic::Udiv:
    case InstArithmetic::Urem:
    case InstArithmetic::Sdiv:
    case InstArithmetic::Srem:
      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
      break;
    case InstArithmetic::Fadd: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
      _addps(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Fsub: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
      _subps(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Fmul: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
      _mulps(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Fdiv: {
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0);
      _divps(T, Src1);
      _movp(Dest, T);
    } break;
    case InstArithmetic::Frem:
      scalarizeArithmetic(Inst->getOp(), Dest, Src0, Src1);
      break;
    }
    return;
  }
1755 Variable *T_edx = nullptr;
1756 Variable *T = nullptr;
1757 switch (Inst->getOp()) {
1758 case InstArithmetic::_num:
1759 llvm_unreachable("Unknown arithmetic operator");
1761 case InstArithmetic::Add:
1766 case InstArithmetic::And:
1771 case InstArithmetic::Or:
1776 case InstArithmetic::Xor:
1781 case InstArithmetic::Sub:
1786 case InstArithmetic::Mul:
1787 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1788 if (optimizeScalarMul(Dest, Src0, C->getValue()))
1791 // The 8-bit version of imul only allows the form "imul r/m8"
1792 // where T must be in eax.
1793 if (isByteSizedArithType(Dest->getType())) {
1794 _mov(T, Src0, RegX8632::Reg_eax);
1795 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1802 case InstArithmetic::Shl:
1804 if (!llvm::isa<Constant>(Src1))
1805 Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
1809 case InstArithmetic::Lshr:
1811 if (!llvm::isa<Constant>(Src1))
1812 Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
1816 case InstArithmetic::Ashr:
1818 if (!llvm::isa<Constant>(Src1))
1819 Src1 = legalizeToVar(Src1, RegX8632::Reg_ecx);
1823 case InstArithmetic::Udiv:
1824 // div and idiv are the few arithmetic operators that do not allow
1825 // immediates as the operand.
1826 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1827 if (isByteSizedArithType(Dest->getType())) {
1828 Variable *T_ah = nullptr;
1829 Constant *Zero = Ctx->getConstantZero(IceType_i8);
1830 _mov(T, Src0, RegX8632::Reg_eax);
1831 _mov(T_ah, Zero, RegX8632::Reg_ah);
1832 _div(T, Src1, T_ah);
1835 Constant *Zero = Ctx->getConstantZero(IceType_i32);
1836 _mov(T, Src0, RegX8632::Reg_eax);
1837 _mov(T_edx, Zero, RegX8632::Reg_edx);
1838 _div(T, Src1, T_edx);
1842 case InstArithmetic::Sdiv:
1843 // TODO(stichnot): Enable this after doing better performance
1844 // and cross testing.
1845 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
1846 // Optimize division by constant power of 2, but not for Om1
1847 // or O0, just to keep things simple there.
1848 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1849 int32_t Divisor = C->getValue();
1850 uint32_t UDivisor = static_cast<uint32_t>(Divisor);
1851 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
1852 uint32_t LogDiv = llvm::Log2_32(UDivisor);
1853 Type Ty = Dest->getType();
1854 // LLVM does the following for dest=src/(1<<log):
1856 // sar t,typewidth-1 // -1 if src is negative, 0 if not
1857 // shr t,typewidth-log
1861 uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
1863 // If for some reason we are dividing by 1, just treat it
1864 // like an assignment.
1866 // The initial sar is unnecessary when dividing by 2.
1868 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
1869 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
1871 _sar(T, Ctx->getConstantInt(Ty, LogDiv));
1878 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1879 if (isByteSizedArithType(Dest->getType())) {
1880 _mov(T, Src0, RegX8632::Reg_eax);
1885 T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
1886 _mov(T, Src0, RegX8632::Reg_eax);
1888 _idiv(T, Src1, T_edx);
1892 case InstArithmetic::Urem:
1893 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1894 if (isByteSizedArithType(Dest->getType())) {
1895 Variable *T_ah = nullptr;
1896 Constant *Zero = Ctx->getConstantZero(IceType_i8);
1897 _mov(T, Src0, RegX8632::Reg_eax);
1898 _mov(T_ah, Zero, RegX8632::Reg_ah);
1899 _div(T_ah, Src1, T);
1902 Constant *Zero = Ctx->getConstantZero(IceType_i32);
1903 _mov(T_edx, Zero, RegX8632::Reg_edx);
1904 _mov(T, Src0, RegX8632::Reg_eax);
1905 _div(T_edx, Src1, T);
1909 case InstArithmetic::Srem:
1910 // TODO(stichnot): Enable this after doing better performance
1911 // and cross testing.
1912 if (false && Ctx->getFlags().getOptLevel() >= Opt_1) {
1913 // Optimize mod by constant power of 2, but not for Om1 or O0,
1914 // just to keep things simple there.
1915 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1916 int32_t Divisor = C->getValue();
1917 uint32_t UDivisor = static_cast<uint32_t>(Divisor);
1918 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
1919 uint32_t LogDiv = llvm::Log2_32(UDivisor);
1920 Type Ty = Dest->getType();
1921 // LLVM does the following for dest=src%(1<<log):
1923 // sar t,typewidth-1 // -1 if src is negative, 0 if not
1924 // shr t,typewidth-log
          uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
          // If for some reason we are dividing by 1, just assign 0.
          if (LogDiv == 0) {
            _mov(Dest, Ctx->getConstantZero(Ty));
            return;
          }
          _mov(T, Src0);
          // The initial sar is unnecessary when dividing by 2.
          if (LogDiv > 1)
            _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
          _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
          _add(T, Src0);
          _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
          _sub(T, Src0);
          _neg(T);
          _mov(Dest, T);
          return;
        }
      }
    }
    Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
    if (isByteSizedArithType(Dest->getType())) {
      Variable *T_ah = makeReg(IceType_i8, RegX8632::Reg_ah);
      _mov(T, Src0, RegX8632::Reg_eax);
      _cbwdq(T, T);
      Context.insert(InstFakeDef::create(Func, T_ah));
      _idiv(T_ah, Src1, T);
      _mov(Dest, T_ah);
    } else {
      T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
      _mov(T, Src0, RegX8632::Reg_eax);
      _cbwdq(T_edx, T);
      _idiv(T_edx, Src1, T);
      _mov(Dest, T_edx);
    }
    break;
  case InstArithmetic::Fadd:
    _mov(T, Src0);
    _addss(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Fsub:
    _mov(T, Src0);
    _subss(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Fmul:
    _mov(T, Src0);
    _mulss(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Fdiv:
    _mov(T, Src0);
    _divss(T, Src1);
    _mov(Dest, T);
    break;
  case InstArithmetic::Frem: {
    const SizeT MaxSrcs = 2;
    Type Ty = Dest->getType();
    InstCall *Call = makeHelperCall(
        isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
    Call->addArg(Src0);
    Call->addArg(Src1);
    return lowerCall(Call);
  }
  }
}

void TargetX8632::lowerAssign(const InstAssign *Inst) {
  Variable *Dest = Inst->getDest();
  Operand *Src0 = Inst->getSrc(0);
  assert(Dest->getType() == Src0->getType());
  if (Dest->getType() == IceType_i64) {
    Src0 = legalize(Src0);
    Operand *Src0Lo = loOperand(Src0);
    Operand *Src0Hi = hiOperand(Src0);
    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
    Variable *T_Lo = nullptr, *T_Hi = nullptr;
    _mov(T_Lo, Src0Lo);
    _mov(DestLo, T_Lo);
    _mov(T_Hi, Src0Hi);
    _mov(DestHi, T_Hi);
  } else {
    Operand *RI;
    if (Dest->hasReg())
      // If Dest already has a physical register, then legalize the
      // Src operand into a Variable with the same register
      // assignment. This is mostly a workaround for advanced phi
      // lowering's ad-hoc register allocation, which assumes no
      // register allocation is needed when at least one of the
      // operands is non-memory.
      RI = legalize(Src0, Legal_Reg, Dest->getRegNum());
    else
      // If Dest could be a stack operand, then RI must be a physical
      // register or a scalar integer immediate.
      RI = legalize(Src0, Legal_Reg | Legal_Imm);
    if (isVectorType(Dest->getType()))
      _movp(Dest, RI);
    else
      _mov(Dest, RI);
  }
}

void TargetX8632::lowerBr(const InstBr *Inst) {
  if (Inst->isUnconditional()) {
    _br(Inst->getTargetUnconditional());
    return;
  }
  Operand *Cond = Inst->getCondition();

  // Handle folding opportunities.
  if (const class Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
    assert(Producer->isDeleted());
    switch (BoolFolding::getProducerKind(Producer)) {
    default:
      break;
    case BoolFolding::PK_Icmp32: {
      // TODO(stichnot): Refactor similarities between this block and
      // the corresponding code in lowerIcmp().
      auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer);
      Operand *Src0 = Producer->getSrc(0);
      Operand *Src1 = legalize(Producer->getSrc(1));
      Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
      _cmp(Src0RM, Src1);
      _br(getIcmp32Mapping(Cmp->getCondition()), Inst->getTargetTrue(),
          Inst->getTargetFalse());
      return;
    }
    }
  }

  Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
  Constant *Zero = Ctx->getConstantZero(IceType_i32);
  _cmp(Src0, Zero);
  _br(CondX86::Br_ne, Inst->getTargetTrue(), Inst->getTargetFalse());
}

void TargetX8632::lowerCall(const InstCall *Instr) {
  // x86-32 calling convention:
  //
  // * At the point before the call, the stack must be aligned to 16
  //   bytes.
  //
  // * The first four arguments of vector type, regardless of their
  //   position relative to the other arguments in the argument list, are
  //   placed in registers xmm0 - xmm3.
  //
  // * Other arguments are pushed onto the stack in right-to-left order,
  //   such that the left-most argument ends up on the top of the stack at
  //   the lowest memory address.
  //
  // * Stack arguments of vector type are aligned to start at the next
  //   highest multiple of 16 bytes. Other stack arguments are aligned to
  //   4 bytes.
  //
  // This intends to match the section "IA-32 Function Calling
  // Convention" of the document "OS X ABI Function Call Guide" by
  // Apple.
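  //
  // For example (illustrative): for a call f(i32 a, <4 x i32> v, i64 b),
  // v is passed in xmm0, while a lands at [esp+0] and b occupies
  // [esp+4] through [esp+11] at the point of the call.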
  NeedsStackAlignment = true;

  typedef std::vector<Operand *> OperandList;
  OperandList XmmArgs;
  OperandList StackArgs, StackArgLocations;
  uint32_t ParameterAreaSizeBytes = 0;

  // Classify each argument operand according to the location where the
  // argument is passed.
  for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
    Operand *Arg = Instr->getArg(i);
    Type Ty = Arg->getType();
    // The PNaCl ABI requires the width of arguments to be at least 32 bits.
    assert(typeWidthInBytes(Ty) >= 4);
    if (isVectorType(Ty) && XmmArgs.size() < X86_MAX_XMM_ARGS) {
      XmmArgs.push_back(Arg);
    } else {
      StackArgs.push_back(Arg);
      if (isVectorType(Arg->getType())) {
        ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
      }
      Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
      Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
      StackArgLocations.push_back(OperandX8632Mem::create(Func, Ty, esp, Loc));
      ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
    }
  }
  // Adjust the parameter area so that the stack is aligned. It is
  // assumed that the stack is already aligned at the start of the
  // calling sequence.
  ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);

  // Subtract the appropriate amount for the argument area. This also
  // takes care of setting the stack adjustment during emission.
  //
  // TODO: If for some reason the call instruction gets dead-code
  // eliminated after lowering, we would need to ensure that the
  // pre-call and the post-call esp adjustment get eliminated as well.
  if (ParameterAreaSizeBytes) {
    _adjust_stack(ParameterAreaSizeBytes);
  }
  // Copy arguments that are passed on the stack to the appropriate
  // stack locations.
  for (SizeT i = 0, e = StackArgs.size(); i < e; ++i) {
    lowerStore(InstStore::create(Func, StackArgs[i], StackArgLocations[i]));
  }

  // Copy arguments to be passed in registers to the appropriate
  // registers.
  // TODO: Investigate the impact of lowering arguments passed in
  // registers after lowering stack arguments as opposed to the other
  // way around. Lowering register arguments after stack arguments may
  // reduce register pressure. On the other hand, lowering register
  // arguments first (before stack arguments) may result in more compact
  // code, as the memory operand displacements may end up being smaller
  // before any stack adjustment is done.
  for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
    Variable *Reg = legalizeToVar(XmmArgs[i], RegX8632::Reg_xmm0 + i);
    // Generate a FakeUse of register arguments so that they do not get
    // dead code eliminated as a result of the FakeKill of scratch
    // registers after the call.
    Context.insert(InstFakeUse::create(Func, Reg));
  }
  // Generate the call instruction. Assign its result to a temporary
  // with high register allocation weight.
  Variable *Dest = Instr->getDest();
  // ReturnReg doubles as ReturnRegLo as necessary.
  Variable *ReturnReg = nullptr;
  Variable *ReturnRegHi = nullptr;
  if (Dest) {
    switch (Dest->getType()) {
    case IceType_NUM:
      llvm_unreachable("Invalid Call dest type");
      break;
    case IceType_void:
      break;
    case IceType_i1:
    case IceType_i8:
    case IceType_i16:
    case IceType_i32:
      ReturnReg = makeReg(Dest->getType(), RegX8632::Reg_eax);
      break;
    case IceType_i64:
      ReturnReg = makeReg(IceType_i32, RegX8632::Reg_eax);
      ReturnRegHi = makeReg(IceType_i32, RegX8632::Reg_edx);
      break;
    case IceType_f32:
    case IceType_f64:
      // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
      // the fstp instruction.
      break;
    case IceType_v4i1:
    case IceType_v8i1:
    case IceType_v16i1:
    case IceType_v16i8:
    case IceType_v8i16:
    case IceType_v4i32:
    case IceType_v4f32:
      ReturnReg = makeReg(Dest->getType(), RegX8632::Reg_xmm0);
      break;
    }
  }
  Operand *CallTarget = legalize(Instr->getCallTarget());
  const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
  if (NeedSandboxing) {
    if (llvm::isa<Constant>(CallTarget)) {
      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
    } else {
      Variable *CallTargetVar = nullptr;
      _mov(CallTargetVar, CallTarget);
      _bundle_lock(InstBundleLock::Opt_AlignToEnd);
      const SizeT BundleSize =
          1 << Func->getAssembler<>()->getBundleAlignLog2Bytes();
      _and(CallTargetVar, Ctx->getConstantInt32(~(BundleSize - 1)));
      CallTarget = CallTargetVar;
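      // E.g., with 32-byte bundles (illustrative): and ecx, 0xFFFFFFE0
      // forces the computed target down to a bundle boundary before the
      // indirect call, per the NaCl sandboxing rules.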
    }
  }
  Inst *NewCall = InstX8632Call::create(Func, ReturnReg, CallTarget);
  Context.insert(NewCall);
  if (NeedSandboxing)
    _bundle_unlock();
  if (ReturnRegHi)
    Context.insert(InstFakeDef::create(Func, ReturnRegHi));

  // Add the appropriate offset to esp. The call instruction takes care
  // of resetting the stack offset during emission.
  if (ParameterAreaSizeBytes) {
    Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
    _add(esp, Ctx->getConstantInt32(ParameterAreaSizeBytes));
  }

  // Insert a register-kill pseudo instruction.
  Context.insert(InstFakeKill::create(Func, NewCall));

  // Generate a FakeUse to keep the call live if necessary.
  if (Instr->hasSideEffects() && ReturnReg) {
    Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
    Context.insert(FakeUse);
  }

  if (!Dest)
    return;

  // Assign the result of the call to Dest.
  if (ReturnRegHi) {
    assert(Dest->getType() == IceType_i64);
    split64(Dest);
    Variable *DestLo = Dest->getLo();
    Variable *DestHi = Dest->getHi();
    _mov(DestLo, ReturnReg);
    _mov(DestHi, ReturnRegHi);
  } else if (ReturnReg) {
    assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
           Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
           isVectorType(Dest->getType()));
    if (isVectorType(Dest->getType())) {
      _movp(Dest, ReturnReg);
    } else {
      _mov(Dest, ReturnReg);
    }
  } else if (isScalarFloatingType(Dest->getType())) {
    // Special treatment for an FP function which returns its result in
    // st(0).
    // If Dest ends up being a physical xmm register, the fstp emit code
    // will route st(0) through a temporary stack slot.
    _fstp(Dest);
    // Create a fake use of Dest in case it actually isn't used,
    // because st(0) still needs to be popped.
    Context.insert(InstFakeUse::create(Func, Dest));
  }
}

void TargetX8632::lowerCast(const InstCast *Inst) {
  // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
  InstCast::OpKind CastKind = Inst->getCastKind();
  Variable *Dest = Inst->getDest();
  switch (CastKind) {
  default:
    Func->setError("Cast type not supported");
    return;
  case InstCast::Sext: {
    // Src0RM is the source operand legalized to physical register or memory,
    // but not immediate, since the relevant x86 native instructions don't
    // allow an immediate operand. If the operand is an immediate, we could
    // consider computing the strength-reduced result at translation time,
    // but we're unlikely to see something like that in the bitcode that
    // the optimizer wouldn't have already taken care of.
    Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
    if (isVectorType(Dest->getType())) {
      Type DestTy = Dest->getType();
      if (DestTy == IceType_v16i8) {
        // onemask = materialize(1,1,...); dst = (src & onemask) > 0
        Variable *OneMask = makeVectorOfOnes(Dest->getType());
        Variable *T = makeReg(DestTy);
        _movp(T, Src0RM);
        _pand(T, OneMask);
        Variable *Zeros = makeVectorOfZeros(Dest->getType());
        _pcmpgt(T, Zeros);
        _movp(Dest, T);
      } else {
        // width = width(elty) - 1; dest = (src << width) >> width
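        // E.g. (illustrative), for v8i16 lanes:
        //   psllw x, 15   ; move bit 0 up to the sign bit
        //   psraw x, 15   ; replicate it, giving 0x0000 or 0xFFFF per lane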
        SizeT ShiftAmount =
            X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) - 1;
        Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
        Variable *T = makeReg(DestTy);
        _movp(T, Src0RM);
        _psll(T, ShiftConstant);
        _psra(T, ShiftConstant);
        _movp(Dest, T);
      }
    } else if (Dest->getType() == IceType_i64) {
      // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
      Constant *Shift = Ctx->getConstantInt32(31);
      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
      Variable *T_Lo = makeReg(DestLo->getType());
      if (Src0RM->getType() == IceType_i32) {
        _mov(T_Lo, Src0RM);
      } else if (Src0RM->getType() == IceType_i1) {
        _movzx(T_Lo, Src0RM);
        _shl(T_Lo, Shift);
        _sar(T_Lo, Shift);
      } else {
        _movsx(T_Lo, Src0RM);
      }
      _mov(DestLo, T_Lo);
      Variable *T_Hi = nullptr;
      _mov(T_Hi, T_Lo);
      if (Src0RM->getType() != IceType_i1)
        // For i1, the sar instruction is already done above.
        _sar(T_Hi, Shift);
      _mov(DestHi, T_Hi);
    } else if (Src0RM->getType() == IceType_i1) {
      // t1 = src
      // shl t1, dst_bitwidth - 1
      // sar t1, dst_bitwidth - 1
      // dst = t1
      size_t DestBits = X86_CHAR_BIT * typeWidthInBytes(Dest->getType());
      Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
      Variable *T = makeReg(Dest->getType());
      if (typeWidthInBytes(Dest->getType()) <=
          typeWidthInBytes(Src0RM->getType())) {
        _mov(T, Src0RM);
      } else {
        // Widen the source using movsx or movzx. (It doesn't matter
        // which one, since the following shl/sar overwrite the bits.)
        _movzx(T, Src0RM);
      }
      _shl(T, ShiftAmount);
      _sar(T, ShiftAmount);
      _mov(Dest, T);
    } else {
      // t1 = movsx src; dst = t1
      Variable *T = makeReg(Dest->getType());
      _movsx(T, Src0RM);
      _mov(Dest, T);
    }
    break;
  }
  case InstCast::Zext: {
    Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
    if (isVectorType(Dest->getType())) {
      // onemask = materialize(1,1,...); dest = onemask & src
      Type DestTy = Dest->getType();
      Variable *OneMask = makeVectorOfOnes(DestTy);
      Variable *T = makeReg(DestTy);
      _movp(T, Src0RM);
      _pand(T, OneMask);
      _movp(Dest, T);
    } else if (Dest->getType() == IceType_i64) {
      // t1=movzx src; dst.lo=t1; dst.hi=0
      Constant *Zero = Ctx->getConstantZero(IceType_i32);
      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
      Variable *Tmp = makeReg(DestLo->getType());
      if (Src0RM->getType() == IceType_i32) {
        _mov(Tmp, Src0RM);
      } else {
        _movzx(Tmp, Src0RM);
      }
      if (Src0RM->getType() == IceType_i1) {
        Constant *One = Ctx->getConstantInt32(1);
        _and(Tmp, One);
      }
      _mov(DestLo, Tmp);
      _mov(DestHi, Zero);
    } else if (Src0RM->getType() == IceType_i1) {
      // t = Src0RM; t &= 1; Dest = t
      Constant *One = Ctx->getConstantInt32(1);
      Type DestTy = Dest->getType();
      Variable *T = nullptr;
      if (DestTy == IceType_i8) {
        T = makeReg(DestTy);
        _mov(T, Src0RM);
      } else {
        // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
        T = makeReg(IceType_i32);
        _movzx(T, Src0RM);
      }
      _and(T, One);
      _mov(Dest, T);
    } else {
      // t1 = movzx src; dst = t1
      Variable *T = makeReg(Dest->getType());
      _movzx(T, Src0RM);
      _mov(Dest, T);
    }
    break;
  }
  case InstCast::Trunc: {
    if (isVectorType(Dest->getType())) {
      // onemask = materialize(1,1,...); dst = src & onemask
      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
      Type Src0Ty = Src0RM->getType();
      Variable *OneMask = makeVectorOfOnes(Src0Ty);
      Variable *T = makeReg(Dest->getType());
      _movp(T, Src0RM);
      _pand(T, OneMask);
      _movp(Dest, T);
    } else {
      Operand *Src0 = Inst->getSrc(0);
      if (Src0->getType() == IceType_i64)
        Src0 = loOperand(Src0);
      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
      // t1 = trunc Src0RM; Dest = t1
      Variable *T = nullptr;
      _mov(T, Src0RM);
      if (Dest->getType() == IceType_i1)
        _and(T, Ctx->getConstantInt1(1));
      _mov(Dest, T);
    }
    break;
  }
  case InstCast::Fptrunc:
  case InstCast::Fpext: {
    Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
    // t1 = cvt Src0RM; Dest = t1
    Variable *T = makeReg(Dest->getType());
    _cvt(T, Src0RM, InstX8632Cvt::Float2float);
    _mov(Dest, T);
    break;
  }
  case InstCast::Fptosi:
    if (isVectorType(Dest->getType())) {
      assert(Dest->getType() == IceType_v4i32 &&
             Inst->getSrc(0)->getType() == IceType_v4f32);
      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
      if (llvm::isa<OperandX8632Mem>(Src0RM))
        Src0RM = legalizeToVar(Src0RM);
      Variable *T = makeReg(Dest->getType());
      _cvt(T, Src0RM, InstX8632Cvt::Tps2dq);
      _movp(Dest, T);
    } else if (Dest->getType() == IceType_i64) {
      // Use a helper for converting floating-point values to 64-bit
      // integers. SSE2 appears to have no way to convert from xmm
      // registers to something like the edx:eax register pair, and
      // gcc and clang both want to use x87 instructions complete with
      // temporary manipulation of the status word. This helper is
      // not needed for x86-64.
      split64(Dest);
      const SizeT MaxSrcs = 1;
      Type SrcType = Inst->getSrc(0)->getType();
      InstCall *Call =
          makeHelperCall(isFloat32Asserting32Or64(SrcType) ? H_fptosi_f32_i64
                                                           : H_fptosi_f64_i64,
                         Dest, MaxSrcs);
      Call->addArg(Inst->getSrc(0));
      lowerCall(Call);
    } else {
      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
      Variable *T_1 = makeReg(IceType_i32);
      Variable *T_2 = makeReg(Dest->getType());
      _cvt(T_1, Src0RM, InstX8632Cvt::Tss2si);
      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
      if (Dest->getType() == IceType_i1)
        _and(T_2, Ctx->getConstantInt1(1));
      _mov(Dest, T_2);
    }
    break;
  case InstCast::Fptoui:
    if (isVectorType(Dest->getType())) {
      assert(Dest->getType() == IceType_v4i32 &&
             Inst->getSrc(0)->getType() == IceType_v4f32);
      const SizeT MaxSrcs = 1;
      InstCall *Call = makeHelperCall(H_fptoui_4xi32_f32, Dest, MaxSrcs);
      Call->addArg(Inst->getSrc(0));
      lowerCall(Call);
    } else if (Dest->getType() == IceType_i64 ||
               Dest->getType() == IceType_i32) {
      // Use a helper for both x86-32 and x86-64.
      split64(Dest);
      const SizeT MaxSrcs = 1;
      Type DestType = Dest->getType();
      Type SrcType = Inst->getSrc(0)->getType();
      IceString TargetString;
      if (isInt32Asserting32Or64(DestType)) {
        TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i32
                                                         : H_fptoui_f64_i32;
      } else {
        TargetString = isFloat32Asserting32Or64(SrcType) ? H_fptoui_f32_i64
                                                         : H_fptoui_f64_i64;
      }
      InstCall *Call = makeHelperCall(TargetString, Dest, MaxSrcs);
      Call->addArg(Inst->getSrc(0));
      lowerCall(Call);
    } else {
      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
      // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
      Variable *T_1 = makeReg(IceType_i32);
      Variable *T_2 = makeReg(Dest->getType());
      _cvt(T_1, Src0RM, InstX8632Cvt::Tss2si);
      _mov(T_2, T_1); // T_1 and T_2 may have different integer types
      if (Dest->getType() == IceType_i1)
        _and(T_2, Ctx->getConstantInt1(1));
      _mov(Dest, T_2);
    }
    break;
  case InstCast::Sitofp:
    if (isVectorType(Dest->getType())) {
      assert(Dest->getType() == IceType_v4f32 &&
             Inst->getSrc(0)->getType() == IceType_v4i32);
      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
      if (llvm::isa<OperandX8632Mem>(Src0RM))
        Src0RM = legalizeToVar(Src0RM);
      Variable *T = makeReg(Dest->getType());
      _cvt(T, Src0RM, InstX8632Cvt::Dq2ps);
      _movp(Dest, T);
    } else if (Inst->getSrc(0)->getType() == IceType_i64) {
      // Use a helper for x86-32.
      const SizeT MaxSrcs = 1;
      Type DestType = Dest->getType();
      InstCall *Call =
          makeHelperCall(isFloat32Asserting32Or64(DestType) ? H_sitofp_i64_f32
                                                            : H_sitofp_i64_f64,
                         Dest, MaxSrcs);
      // TODO: Call the correct compiler-rt helper function.
      Call->addArg(Inst->getSrc(0));
      lowerCall(Call);
    } else {
      Operand *Src0RM = legalize(Inst->getSrc(0), Legal_Reg | Legal_Mem);
      // Sign-extend the operand.
      // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
      Variable *T_1 = makeReg(IceType_i32);
      Variable *T_2 = makeReg(Dest->getType());
      if (Src0RM->getType() == IceType_i32)
        _mov(T_1, Src0RM);
      else
        _movsx(T_1, Src0RM);
      _cvt(T_2, T_1, InstX8632Cvt::Si2ss);
      _mov(Dest, T_2);
    }
    break;
  case InstCast::Uitofp: {
    Operand *Src0 = Inst->getSrc(0);
    if (isVectorType(Src0->getType())) {
      assert(Dest->getType() == IceType_v4f32 &&
             Src0->getType() == IceType_v4i32);
      const SizeT MaxSrcs = 1;
      InstCall *Call = makeHelperCall(H_uitofp_4xi32_4xf32, Dest, MaxSrcs);
      Call->addArg(Src0);
      lowerCall(Call);
    } else if (Src0->getType() == IceType_i64 ||
               Src0->getType() == IceType_i32) {
      // Use a helper for x86-32 and x86-64. Also use a helper for
      // i32 on x86-32.
      const SizeT MaxSrcs = 1;
      Type DestType = Dest->getType();
      IceString TargetString;
      if (isInt32Asserting32Or64(Src0->getType())) {
        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i32_f32
                                                          : H_uitofp_i32_f64;
      } else {
        TargetString = isFloat32Asserting32Or64(DestType) ? H_uitofp_i64_f32
                                                          : H_uitofp_i64_f64;
      }
      InstCall *Call = makeHelperCall(TargetString, Dest, MaxSrcs);
      Call->addArg(Src0);
      lowerCall(Call);
    } else {
      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
      // Zero-extend the operand.
      // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
      Variable *T_1 = makeReg(IceType_i32);
      Variable *T_2 = makeReg(Dest->getType());
      if (Src0RM->getType() == IceType_i32)
        _mov(T_1, Src0RM);
      else
        _movzx(T_1, Src0RM);
      _cvt(T_2, T_1, InstX8632Cvt::Si2ss);
      _mov(Dest, T_2);
    }
    break;
  }
  case InstCast::Bitcast: {
    Operand *Src0 = Inst->getSrc(0);
    if (Dest->getType() == Src0->getType()) {
      InstAssign *Assign = InstAssign::create(Func, Dest, Src0);
      lowerAssign(Assign);
      return;
    }
    switch (Dest->getType()) {
    default:
      llvm_unreachable("Unexpected Bitcast dest type");
    case IceType_i8: {
      assert(Src0->getType() == IceType_v8i1);
      InstCall *Call = makeHelperCall(H_bitcast_8xi1_i8, Dest, 1);
      Call->addArg(Src0);
      lowerCall(Call);
    } break;
    case IceType_i16: {
      assert(Src0->getType() == IceType_v16i1);
      InstCall *Call = makeHelperCall(H_bitcast_16xi1_i16, Dest, 1);
      Call->addArg(Src0);
      lowerCall(Call);
    } break;
    case IceType_i32:
    case IceType_f32: {
      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
      Type DestType = Dest->getType();
      Type SrcType = Src0RM->getType();
      assert((DestType == IceType_i32 && SrcType == IceType_f32) ||
             (DestType == IceType_f32 && SrcType == IceType_i32));
      // a.i32 = bitcast b.f32 ==>
      //   t.f32 = b.f32
      //   s.f32 = spill t.f32
      //   a.i32 = s.f32
      Variable *T = nullptr;
      // TODO: Should be able to force a spill setup by calling legalize() with
      // Legal_Mem and not Legal_Reg or Legal_Imm.
      SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(SrcType);
      SpillVar->setLinkedTo(Dest);
      Variable *Spill = SpillVar;
      Spill->setWeight(RegWeight::Zero);
      _mov(T, Src0RM);
      _mov(Spill, T);
      _mov(Dest, Spill);
    } break;
    case IceType_i64: {
      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
      assert(Src0RM->getType() == IceType_f64);
      // a.i64 = bitcast b.f64 ==>
      //   s.f64 = spill b.f64
      //   t_lo.i32 = lo(s.f64)
      //   a_lo.i32 = t_lo.i32
      //   t_hi.i32 = hi(s.f64)
      //   a_hi.i32 = t_hi.i32
      Operand *SpillLo, *SpillHi;
      if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
        SpillVariable *SpillVar =
            Func->makeVariable<SpillVariable>(IceType_f64);
        SpillVar->setLinkedTo(Src0Var);
        Variable *Spill = SpillVar;
        Spill->setWeight(RegWeight::Zero);
        _movq(Spill, Src0RM);
        SpillLo = VariableSplit::create(Func, Spill, VariableSplit::Low);
        SpillHi = VariableSplit::create(Func, Spill, VariableSplit::High);
      } else {
        SpillLo = loOperand(Src0RM);
        SpillHi = hiOperand(Src0RM);
      }

      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
      Variable *T_Lo = makeReg(IceType_i32);
      Variable *T_Hi = makeReg(IceType_i32);

      _mov(T_Lo, SpillLo);
      _mov(DestLo, T_Lo);
      _mov(T_Hi, SpillHi);
      _mov(DestHi, T_Hi);
    } break;
    case IceType_f64: {
      Src0 = legalize(Src0);
      assert(Src0->getType() == IceType_i64);
      if (llvm::isa<OperandX8632Mem>(Src0)) {
        Variable *T = Func->makeVariable(Dest->getType());
        _movq(T, Src0);
        _movq(Dest, T);
        break;
      }
      // a.f64 = bitcast b.i64 ==>
      //   t_lo.i32 = b_lo.i32
      //   FakeDef(s.f64)
      //   lo(s.f64) = t_lo.i32
      //   t_hi.i32 = b_hi.i32
      //   hi(s.f64) = t_hi.i32
      //   a.f64 = s.f64
      SpillVariable *SpillVar = Func->makeVariable<SpillVariable>(IceType_f64);
      SpillVar->setLinkedTo(Dest);
      Variable *Spill = SpillVar;
      Spill->setWeight(RegWeight::Zero);

      Variable *T_Lo = nullptr, *T_Hi = nullptr;
      VariableSplit *SpillLo =
          VariableSplit::create(Func, Spill, VariableSplit::Low);
      VariableSplit *SpillHi =
          VariableSplit::create(Func, Spill, VariableSplit::High);
      _mov(T_Lo, loOperand(Src0));
      // Technically, the Spill is defined after the _store happens, but
      // SpillLo is considered a "use" of Spill so define Spill before it
      // is used.
      Context.insert(InstFakeDef::create(Func, Spill));
      _store(T_Lo, SpillLo);
      _mov(T_Hi, hiOperand(Src0));
      _store(T_Hi, SpillHi);
      _movq(Dest, Spill);
    } break;
    case IceType_v8i1: {
      assert(Src0->getType() == IceType_i8);
      InstCall *Call = makeHelperCall(H_bitcast_i8_8xi1, Dest, 1);
      Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
      // Arguments to functions are required to be at least 32 bits wide.
      lowerCast(InstCast::create(Func, InstCast::Zext, Src0AsI32, Src0));
      Call->addArg(Src0AsI32);
      lowerCall(Call);
    } break;
    case IceType_v16i1: {
      assert(Src0->getType() == IceType_i16);
      InstCall *Call = makeHelperCall(H_bitcast_i16_16xi1, Dest, 1);
      Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
      // Arguments to functions are required to be at least 32 bits wide.
      lowerCast(InstCast::create(Func, InstCast::Zext, Src0AsI32, Src0));
      Call->addArg(Src0AsI32);
      lowerCall(Call);
    } break;
    case IceType_v8i16:
    case IceType_v16i8:
    case IceType_v4i32:
    case IceType_v4f32: {
      _movp(Dest, legalizeToVar(Src0));
    } break;
    }
    break;
  }
  }
}

void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {
  Operand *SourceVectNotLegalized = Inst->getSrc(0);
  ConstantInteger32 *ElementIndex =
      llvm::dyn_cast<ConstantInteger32>(Inst->getSrc(1));
  // Only constant indices are allowed in PNaCl IR.
  assert(ElementIndex);

  unsigned Index = ElementIndex->getValue();
  Type Ty = SourceVectNotLegalized->getType();
  Type ElementTy = typeElementType(Ty);
  Type InVectorElementTy = getInVectorElementType(Ty);
  Variable *ExtractedElementR = makeReg(InVectorElementTy);

  // TODO(wala): Determine the best lowering sequences for each type.
  bool CanUsePextr =
      Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1;
  if (CanUsePextr && Ty != IceType_v4f32) {
    // Use pextrb, pextrw, or pextrd.
    Constant *Mask = Ctx->getConstantInt32(Index);
    Variable *SourceVectR = legalizeToVar(SourceVectNotLegalized);
    _pextr(ExtractedElementR, SourceVectR, Mask);
  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
    // Use pshufd and movd/movss.
    Variable *T = nullptr;
    if (Index) {
      // The shuffle only needs to occur if the element to be extracted
      // is not at the lowest index.
      Constant *Mask = Ctx->getConstantInt32(Index);
      T = makeReg(Ty);
      _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
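      // E.g. (illustrative), extracting lane 2 of a v4i32:
      //   pshufd t, src, 2   ; brings lane 2 down to lane 0
      //   movd dest, t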
    } else {
      T = legalizeToVar(SourceVectNotLegalized);
    }

    if (InVectorElementTy == IceType_i32) {
      _movd(ExtractedElementR, T);
    } else { // Ty == IceType_f32
      // TODO(wala): _movss is only used here because _mov does not
      // allow a vector source and a scalar destination. _mov should be
      // able to be used here.
      // _movss is a binary instruction, so the FakeDef is needed to
      // keep the live range analysis consistent.
      Context.insert(InstFakeDef::create(Func, ExtractedElementR));
      _movss(ExtractedElementR, T);
    }
  } else {
    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
    // Spill the value to a stack slot and do the extraction in memory.
    //
    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when
    // support for legalizing to mem is implemented.
    Variable *Slot = Func->makeVariable(Ty);
    Slot->setWeight(RegWeight::Zero);
    _movp(Slot, legalizeToVar(SourceVectNotLegalized));

    // Compute the location of the element in memory.
    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
    OperandX8632Mem *Loc =
        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
    _mov(ExtractedElementR, Loc);
  }

  if (ElementTy == IceType_i1) {
    // Truncate extracted integers to i1s if necessary.
    Variable *T = makeReg(IceType_i1);
    InstCast *Cast =
        InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
    lowerCast(Cast);
    ExtractedElementR = T;
  }

  // Copy the element to the destination.
  Variable *Dest = Inst->getDest();
  _mov(Dest, ExtractedElementR);
}

void TargetX8632::lowerFcmp(const InstFcmp *Inst) {
  Operand *Src0 = Inst->getSrc(0);
  Operand *Src1 = Inst->getSrc(1);
  Variable *Dest = Inst->getDest();

  if (isVectorType(Dest->getType())) {
    InstFcmp::FCond Condition = Inst->getCondition();
    size_t Index = static_cast<size_t>(Condition);
    assert(Index < TableFcmpSize);

    if (TableFcmp[Index].SwapVectorOperands) {
      Operand *T = Src0;
      Src0 = Src1;
      Src1 = T;
    }

    Variable *T = nullptr;

    if (Condition == InstFcmp::True) {
      // makeVectorOfOnes() requires an integer vector type.
      T = makeVectorOfMinusOnes(IceType_v4i32);
    } else if (Condition == InstFcmp::False) {
      T = makeVectorOfZeros(Dest->getType());
    } else {
      Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
      Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
      if (llvm::isa<OperandX8632Mem>(Src1RM))
        Src1RM = legalizeToVar(Src1RM);

      switch (Condition) {
      default: {
        CondX86::CmppsCond Predicate = TableFcmp[Index].Predicate;
        assert(Predicate != CondX86::Cmpps_Invalid);
        T = makeReg(Src0RM->getType());
        _movp(T, Src0RM);
        _cmpps(T, Src1RM, Predicate);
      } break;
      case InstFcmp::One: {
        // Check both unequal and ordered.
        T = makeReg(Src0RM->getType());
        Variable *T2 = makeReg(Src0RM->getType());
        _movp(T, Src0RM);
        _cmpps(T, Src1RM, CondX86::Cmpps_neq);
        _movp(T2, Src0RM);
        _cmpps(T2, Src1RM, CondX86::Cmpps_ord);
        _pand(T, T2);
      } break;
      case InstFcmp::Ueq: {
        // Check both equal or unordered.
        T = makeReg(Src0RM->getType());
        Variable *T2 = makeReg(Src0RM->getType());
        _movp(T, Src0RM);
        _cmpps(T, Src1RM, CondX86::Cmpps_eq);
        _movp(T2, Src0RM);
        _cmpps(T2, Src1RM, CondX86::Cmpps_unord);
        _por(T, T2);
      } break;
      }
    }

    _movp(Dest, T);
    eliminateNextVectorSextInstruction(Dest);
    return;
  }

  // Lowering a = fcmp cond, b, c
  //   ucomiss b, c       /* only if C1 != Br_None */
  //                      /* but swap b,c order if SwapOperands==true */
  //   mov a, <default>
  //   j<C1> label        /* only if C1 != Br_None */
  //   j<C2> label        /* only if C2 != Br_None */
  //   FakeUse(a)         /* only if C1 != Br_None */
  //   mov a, !<default>  /* only if C1 != Br_None */
  //   label:             /* only if C1 != Br_None */
  //
  // setcc lowering when C1 != Br_None && C2 == Br_None:
  //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
  //   setcc a, C1
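  //
  // E.g. (illustrative): a = fcmp ogt b, c fits the setcc form as
  //   ucomiss b, c
  //   seta a             ; unordered operands set CF, so seta yields 0
  // whereas oeq needs the branching form, because ucomiss sets ZF for
  // unordered operands as well as for equality.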
  InstFcmp::FCond Condition = Inst->getCondition();
  size_t Index = static_cast<size_t>(Condition);
  assert(Index < TableFcmpSize);
  if (TableFcmp[Index].SwapScalarOperands)
    std::swap(Src0, Src1);
  bool HasC1 = (TableFcmp[Index].C1 != CondX86::Br_None);
  bool HasC2 = (TableFcmp[Index].C2 != CondX86::Br_None);
  if (HasC1) {
    Src0 = legalize(Src0);
    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
    Variable *T = nullptr;
    _mov(T, Src0);
    _ucomiss(T, Src1RM);
    if (!HasC2) {
      assert(TableFcmp[Index].Default);
      _setcc(Dest, TableFcmp[Index].C1);
      return;
    }
  }
  Constant *Default = Ctx->getConstantInt32(TableFcmp[Index].Default);
  _mov(Dest, Default);
  if (HasC1) {
    InstX8632Label *Label = InstX8632Label::create(Func, this);
    _br(TableFcmp[Index].C1, Label);
    if (HasC2)
      _br(TableFcmp[Index].C2, Label);
    Constant *NonDefault = Ctx->getConstantInt32(!TableFcmp[Index].Default);
    _mov_nonkillable(Dest, NonDefault);
    Context.insert(Label);
  }
}

void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
  Operand *Src0 = legalize(Inst->getSrc(0));
  Operand *Src1 = legalize(Inst->getSrc(1));
  Variable *Dest = Inst->getDest();

  if (isVectorType(Dest->getType())) {
    Type Ty = Src0->getType();
    // Promote i1 vectors to 128 bit integer vector types.
    if (typeElementType(Ty) == IceType_i1) {
      Type NewTy = IceType_NUM;
      switch (Ty) {
      default:
        llvm_unreachable("unexpected type");
        break;
      case IceType_v4i1:
        NewTy = IceType_v4i32;
        break;
      case IceType_v8i1:
        NewTy = IceType_v8i16;
        break;
      case IceType_v16i1:
        NewTy = IceType_v16i8;
        break;
      }
      Variable *NewSrc0 = Func->makeVariable(NewTy);
      Variable *NewSrc1 = Func->makeVariable(NewTy);
      lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
      lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
      Src0 = NewSrc0;
      Src1 = NewSrc1;
      Ty = NewTy;
    }
    InstIcmp::ICond Condition = Inst->getCondition();

    Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
    Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);

    // SSE2 only has signed comparison operations. Transform unsigned
    // inputs in a manner that allows for the use of signed comparison
    // operations by flipping the high order bits.
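    // E.g. (illustrative), with i32 lanes: 1 <u 0xFFFFFFFF holds, and after
    // xor'ing both sides with 0x80000000, 0x80000001 <s 0x7FFFFFFF still
    // holds, so pcmpgt on the biased values computes the unsigned compare.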
    if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
        Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
      Variable *T0 = makeReg(Ty);
      Variable *T1 = makeReg(Ty);
      Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
      _movp(T0, Src0RM);
      _pxor(T0, HighOrderBits);
      _movp(T1, Src1RM);
      _pxor(T1, HighOrderBits);
      Src0RM = T0;
      Src1RM = T1;
    }

    Variable *T = makeReg(Ty);
    switch (Condition) {
    default:
      llvm_unreachable("unexpected condition");
      break;
    case InstIcmp::Eq: {
      if (llvm::isa<OperandX8632Mem>(Src1RM))
        Src1RM = legalizeToVar(Src1RM);
      _movp(T, Src0RM);
      _pcmpeq(T, Src1RM);
    } break;
    case InstIcmp::Ne: {
      if (llvm::isa<OperandX8632Mem>(Src1RM))
        Src1RM = legalizeToVar(Src1RM);
      _movp(T, Src0RM);
      _pcmpeq(T, Src1RM);
      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
      _pxor(T, MinusOne);
    } break;
    case InstIcmp::Ugt:
    case InstIcmp::Sgt: {
      if (llvm::isa<OperandX8632Mem>(Src1RM))
        Src1RM = legalizeToVar(Src1RM);
      _movp(T, Src0RM);
      _pcmpgt(T, Src1RM);
    } break;
    case InstIcmp::Uge:
    case InstIcmp::Sge: {
      // !(Src1RM > Src0RM)
      if (llvm::isa<OperandX8632Mem>(Src0RM))
        Src0RM = legalizeToVar(Src0RM);
      _movp(T, Src1RM);
      _pcmpgt(T, Src0RM);
      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
      _pxor(T, MinusOne);
    } break;
    case InstIcmp::Ult:
    case InstIcmp::Slt: {
      if (llvm::isa<OperandX8632Mem>(Src0RM))
        Src0RM = legalizeToVar(Src0RM);
      _movp(T, Src1RM);
      _pcmpgt(T, Src0RM);
    } break;
    case InstIcmp::Ule:
    case InstIcmp::Sle: {
      // !(Src0RM > Src1RM)
      if (llvm::isa<OperandX8632Mem>(Src1RM))
        Src1RM = legalizeToVar(Src1RM);
      _movp(T, Src0RM);
      _pcmpgt(T, Src1RM);
      Variable *MinusOne = makeVectorOfMinusOnes(Ty);
      _pxor(T, MinusOne);
    } break;
    }

    _movp(Dest, T);
    eliminateNextVectorSextInstruction(Dest);
    return;
  }

  // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
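  //
  // For i64 operands, e.g. a = icmp slt b, c, a sketch of the lowering
  // (illustrative): a signed compare on the high words, then an unsigned
  // compare on the low words:
  //   mov a, 1
  //   cmp b.hi, c.hi
  //   jl .LTrue
  //   jg .LFalse
  //   cmp b.lo, c.lo
  //   jb .LTrue
  // .LFalse:
  //   mov a, 0
  // .LTrue: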
  if (Src0->getType() == IceType_i64) {
    InstIcmp::ICond Condition = Inst->getCondition();
    size_t Index = static_cast<size_t>(Condition);
    assert(Index < TableIcmp64Size);
    Operand *Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
    Operand *Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
    Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
    Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
    Constant *Zero = Ctx->getConstantZero(IceType_i32);
    Constant *One = Ctx->getConstantInt32(1);
    InstX8632Label *LabelFalse = InstX8632Label::create(Func, this);
    InstX8632Label *LabelTrue = InstX8632Label::create(Func, this);
    _mov(Dest, One);
    _cmp(Src0HiRM, Src1HiRI);
    if (TableIcmp64[Index].C1 != CondX86::Br_None)
      _br(TableIcmp64[Index].C1, LabelTrue);
    if (TableIcmp64[Index].C2 != CondX86::Br_None)
      _br(TableIcmp64[Index].C2, LabelFalse);
    _cmp(Src0LoRM, Src1LoRI);
    _br(TableIcmp64[Index].C3, LabelTrue);
    Context.insert(LabelFalse);
    _mov_nonkillable(Dest, Zero);
    Context.insert(LabelTrue);
    return;
  }

  Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
  _cmp(Src0RM, Src1);
  _setcc(Dest, getIcmp32Mapping(Inst->getCondition()));
}

void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
  Operand *SourceVectNotLegalized = Inst->getSrc(0);
  Operand *ElementToInsertNotLegalized = Inst->getSrc(1);
  ConstantInteger32 *ElementIndex =
      llvm::dyn_cast<ConstantInteger32>(Inst->getSrc(2));
  // Only constant indices are allowed in PNaCl IR.
  assert(ElementIndex);
  unsigned Index = ElementIndex->getValue();
  assert(Index < typeNumElements(SourceVectNotLegalized->getType()));

  Type Ty = SourceVectNotLegalized->getType();
  Type ElementTy = typeElementType(Ty);
  Type InVectorElementTy = getInVectorElementType(Ty);

  if (ElementTy == IceType_i1) {
    // Expand the element to the appropriate size for it to be inserted
    // in the vector.
    Variable *Expanded = Func->makeVariable(InVectorElementTy);
    InstCast *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
                                      ElementToInsertNotLegalized);
    lowerCast(Cast);
    ElementToInsertNotLegalized = Expanded;
  }
  if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
    // Use insertps, pinsrb, pinsrw, or pinsrd.
    Operand *ElementRM =
        legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
    Operand *SourceVectRM =
        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
    Variable *T = makeReg(Ty);
    _movp(T, SourceVectRM);
    if (Ty == IceType_v4f32)
      _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
    else
      _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
    _movp(Inst->getDest(), T);
  } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
    // Use shufps or movss.
    Variable *ElementR = nullptr;
    Operand *SourceVectRM =
        legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);

    if (InVectorElementTy == IceType_f32) {
      // ElementR will be in an XMM register since it is floating point.
      ElementR = legalizeToVar(ElementToInsertNotLegalized);
    } else {
      // Copy an integer to an XMM register.
      Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
      ElementR = makeReg(Ty);
      _movd(ElementR, T);
    }

    if (Index == 0) {
      Variable *T = makeReg(Ty);
      _movp(T, SourceVectRM);
      _movss(T, ElementR);
      _movp(Inst->getDest(), T);
      return;
    }
    // shufps treats the source and destination operands as vectors of
    // four doublewords. The destination's two high doublewords are
    // selected from the source operand and the two low doublewords are
    // selected from the (original value of) the destination operand.
    // An insertelement operation can be effected with a sequence of two
    // shufps operations with appropriate masks. In all cases below,
    // Element[0] is being inserted into SourceVectOperand. Indices are
    // ordered from left to right.
    //
    // insertelement into index 1 (result is stored in ElementR):
    //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
    //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
    //
    // insertelement into index 2 (result is stored in T):
    //   T := SourceVectRM
    //   ElementR := ElementR[0, 0] T[0, 3]
    //   T := T[0, 1] ElementR[0, 3]
    //
    // insertelement into index 3 (result is stored in T):
    //   T := SourceVectRM
    //   ElementR := ElementR[0, 0] T[0, 2]
    //   T := T[0, 1] ElementR[3, 0]
    const unsigned char Mask1[3] = {0, 192, 128};
    const unsigned char Mask2[3] = {227, 196, 52};
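    // (For reference: each mask byte holds four 2-bit lane selectors,
    // low lane first. E.g. 227 = 0xE3 selects lanes 3 and 0 from the
    // destination and lanes 2 and 3 from the source, matching the
    // "ElementR[3, 0] SourceVectRM[2, 3]" line above.)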
    Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
    Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);

    if (Index == 1) {
      _shufps(ElementR, SourceVectRM, Mask1Constant);
      _shufps(ElementR, SourceVectRM, Mask2Constant);
      _movp(Inst->getDest(), ElementR);
    } else {
      Variable *T = makeReg(Ty);
      _movp(T, SourceVectRM);
      _shufps(ElementR, T, Mask1Constant);
      _shufps(T, ElementR, Mask2Constant);
      _movp(Inst->getDest(), T);
    }
  } else {
    assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
    // Spill the value to a stack slot and perform the insertion in
    // memory.
    //
    // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when
    // support for legalizing to mem is implemented.
    Variable *Slot = Func->makeVariable(Ty);
    Slot->setWeight(RegWeight::Zero);
    _movp(Slot, legalizeToVar(SourceVectNotLegalized));

    // Compute the location of the position to insert in memory.
    unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
    OperandX8632Mem *Loc =
        getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
    _store(legalizeToVar(ElementToInsertNotLegalized), Loc);

    Variable *T = makeReg(Ty);
    _movp(T, Slot);
    _movp(Inst->getDest(), T);
  }
}

void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
  switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
  case Intrinsics::AtomicCmpxchg: {
    if (!Intrinsics::isMemoryOrderValid(
            ID, getConstantMemoryOrder(Instr->getArg(3)),
            getConstantMemoryOrder(Instr->getArg(4)))) {
      Func->setError("Unexpected memory ordering for AtomicCmpxchg");
      return;
    }
    Variable *DestPrev = Instr->getDest();
    Operand *PtrToMem = Instr->getArg(0);
    Operand *Expected = Instr->getArg(1);
    Operand *Desired = Instr->getArg(2);
    if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
      return;
    lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
    return;
  }
  case Intrinsics::AtomicFence:
    if (!Intrinsics::isMemoryOrderValid(
            ID, getConstantMemoryOrder(Instr->getArg(0)))) {
      Func->setError("Unexpected memory ordering for AtomicFence");
      return;
    }
    _mfence();
    return;
  case Intrinsics::AtomicFenceAll:
    // NOTE: FenceAll should prevent any load/store from being moved
    // across the fence (both atomic and non-atomic). The InstX8632Mfence
    // instruction is currently marked coarsely as "HasSideEffects".
    _mfence();
    return;
  case Intrinsics::AtomicIsLockFree: {
    // X86 is always lock free for 8/16/32/64 bit accesses.
    // TODO(jvoung): Since the result is constant when given a constant
    // byte size, this opens up DCE opportunities.
    Operand *ByteSize = Instr->getArg(0);
    Variable *Dest = Instr->getDest();
    if (ConstantInteger32 *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
      Constant *Result;
      switch (CI->getValue()) {
      default:
        // Some x86-64 processors support the cmpxchg16b instruction, which
        // can make 16-byte operations lock free (when used with the LOCK
        // prefix). However, that's not supported in 32-bit mode, so just
        // return 0 even for large sizes.
        Result = Ctx->getConstantZero(IceType_i32);
        break;
      case 1:
      case 2:
      case 4:
      case 8:
        Result = Ctx->getConstantInt32(1);
        break;
      }
      _mov(Dest, Result);
      return;
    }
    // The PNaCl ABI requires the byte size to be a compile-time constant.
    Func->setError("AtomicIsLockFree byte size should be compile-time const");
    return;
  }
  case Intrinsics::AtomicLoad: {
    // We require the memory address to be naturally aligned.
    // Given that is the case, then normal loads are atomic.
    if (!Intrinsics::isMemoryOrderValid(
            ID, getConstantMemoryOrder(Instr->getArg(1)))) {
      Func->setError("Unexpected memory ordering for AtomicLoad");
      return;
    }
    Variable *Dest = Instr->getDest();
    if (Dest->getType() == IceType_i64) {
      // Follow what GCC does and use a movq instead of what lowerLoad()
      // normally does (split the load into two).
      // Thus, this skips load/arithmetic op folding. Load/arithmetic folding
      // can't happen anyway, since this is x86-32 and integer arithmetic only
      // happens on 32-bit quantities.
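      //
      // Rough shape of the sequence (illustrative, not emitted verbatim):
      //   movq xmm_t, qword ptr [addr]   ; one aligned 64-bit load is atomic
      //   ...then bitcast xmm_t back out to the two 32-bit halves of Dest...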
      Variable *T = makeReg(IceType_f64);
      OperandX8632Mem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
      _movq(T, Addr);
      // Then cast the bits back out of the XMM register to the i64 Dest.
      InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
      lowerCast(Cast);
      // Make sure that the atomic load isn't elided when unused.
      Context.insert(InstFakeUse::create(Func, Dest->getLo()));
      Context.insert(InstFakeUse::create(Func, Dest->getHi()));
      return;
    }
    InstLoad *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
    lowerLoad(Load);
    // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
    // Since lowerLoad may fuse the load w/ an arithmetic instruction,
    // insert the FakeUse on the last-inserted instruction's dest.
    Context.insert(
        InstFakeUse::create(Func, Context.getLastInserted()->getDest()));
    return;
  }
  case Intrinsics::AtomicRMW:
    if (!Intrinsics::isMemoryOrderValid(
            ID, getConstantMemoryOrder(Instr->getArg(3)))) {
      Func->setError("Unexpected memory ordering for AtomicRMW");
      return;
    }
    lowerAtomicRMW(
        Instr->getDest(),
        static_cast<uint32_t>(
            llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
        Instr->getArg(1), Instr->getArg(2));
    return;
  case Intrinsics::AtomicStore: {
    if (!Intrinsics::isMemoryOrderValid(
            ID, getConstantMemoryOrder(Instr->getArg(2)))) {
      Func->setError("Unexpected memory ordering for AtomicStore");
      return;
    }
    // We require the memory address to be naturally aligned.
    // Given that is the case, then normal stores are atomic.
    // Add a fence after the store to make it visible.
    Operand *Value = Instr->getArg(0);
    Operand *Ptr = Instr->getArg(1);
    if (Value->getType() == IceType_i64) {
      // Use a movq instead of what lowerStore() normally does
      // (split the store into two), following what GCC does.
      // Cast the bits from int to an xmm register first.
      Variable *T = makeReg(IceType_f64);
      InstCast *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
      lowerCast(Cast);
      // Then store XMM w/ a movq.
      OperandX8632Mem *Addr = formMemoryOperand(Ptr, IceType_f64);
      _storeq(T, Addr);
      _mfence();
      return;
    }
    InstStore *Store = InstStore::create(Func, Value, Ptr);
    lowerStore(Store);
    _mfence();
    return;
  }
  case Intrinsics::Bswap: {
    Variable *Dest = Instr->getDest();
    Operand *Val = Instr->getArg(0);
    // In 32-bit mode, bswap only works on 32-bit arguments, and the
    // argument must be a register. Use rotate left for 16-bit bswap.
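    // E.g. (illustrative): bswap32(0x11223344) = 0x44332211, and for i16,
    // rol ax, 8 turns 0x1122 into 0x2211.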
    if (Val->getType() == IceType_i64) {
      Variable *T_Lo = legalizeToVar(loOperand(Val));
      Variable *T_Hi = legalizeToVar(hiOperand(Val));
      Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
      _bswap(T_Lo);
      _bswap(T_Hi);
      _mov(DestLo, T_Hi);
      _mov(DestHi, T_Lo);
    } else if (Val->getType() == IceType_i32) {
      Variable *T = legalizeToVar(Val);
      _bswap(T);
      _mov(Dest, T);
    } else {
      assert(Val->getType() == IceType_i16);
      Val = legalize(Val);
      Constant *Eight = Ctx->getConstantInt16(8);
      Variable *T = nullptr;
      _mov(T, Val);
      _rol(T, Eight);
      _mov(Dest, T);
    }
    return;
  }
  case Intrinsics::Ctpop: {
    Variable *Dest = Instr->getDest();
    Operand *Val = Instr->getArg(0);
    InstCall *Call = makeHelperCall(isInt32Asserting32Or64(Val->getType())
                                        ? H_call_ctpop_i32
                                        : H_call_ctpop_i64,
                                    Dest, 1);
    Call->addArg(Val);
    lowerCall(Call);
    // The popcount helpers always return 32-bit values, while the intrinsic's
    // signature matches the native POPCNT instruction and fills a 64-bit reg
    // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
    // the user doesn't do that in the IR. If the user does that in the IR,
    // then this zero'ing instruction is dead and gets optimized out.
    if (Val->getType() == IceType_i64) {
      Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
      Constant *Zero = Ctx->getConstantZero(IceType_i32);
      _mov(DestHi, Zero);
    }
    return;
  }
  case Intrinsics::Ctlz: {
    // The "is zero undef" parameter is ignored and we always return
    // a well-defined value.
    Operand *Val = legalize(Instr->getArg(0));
    Operand *FirstVal;
    Operand *SecondVal = nullptr;
    if (Val->getType() == IceType_i64) {
      FirstVal = loOperand(Val);
      SecondVal = hiOperand(Val);
    } else {
      FirstVal = Val;
    }
    const bool IsCttz = false;
    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
                    SecondVal);
    return;
  }
  case Intrinsics::Cttz: {
    // The "is zero undef" parameter is ignored and we always return
    // a well-defined value.
    Operand *Val = legalize(Instr->getArg(0));
    Operand *FirstVal;
    Operand *SecondVal = nullptr;
    if (Val->getType() == IceType_i64) {
      FirstVal = hiOperand(Val);
      SecondVal = loOperand(Val);
    } else {
      FirstVal = Val;
    }
    const bool IsCttz = true;
    lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
                    SecondVal);
    return;
  }
  case Intrinsics::Fabs: {
    Operand *Src = legalize(Instr->getArg(0));
    Type Ty = Src->getType();
    Variable *Dest = Instr->getDest();
    Variable *T = makeVectorOfFabsMask(Ty);
    // The pand instruction operates on an m128 memory operand, so if
    // Src is an f32 or f64, we need to make sure it's in a register.
    if (isVectorType(Ty)) {
      if (llvm::isa<OperandX8632Mem>(Src))
        Src = legalizeToVar(Src);
    } else {
      Src = legalizeToVar(Src);
    }
    _pand(T, Src);
    if (isVectorType(Ty))
      _movp(Dest, T);
    else
      _mov(Dest, T);
    return;
  }
  case Intrinsics::Longjmp: {
    InstCall *Call = makeHelperCall(H_call_longjmp, nullptr, 2);
    Call->addArg(Instr->getArg(0));
    Call->addArg(Instr->getArg(1));
    lowerCall(Call);
    return;
  }
  case Intrinsics::Memcpy: {
    // In the future, we could potentially emit an inline memcpy/memset, etc.
    // for intrinsic calls w/ a known length.
    InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
    Call->addArg(Instr->getArg(0));
    Call->addArg(Instr->getArg(1));
    Call->addArg(Instr->getArg(2));
    lowerCall(Call);
    return;
  }
  case Intrinsics::Memmove: {
    InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
    Call->addArg(Instr->getArg(0));
    Call->addArg(Instr->getArg(1));
    Call->addArg(Instr->getArg(2));
    lowerCall(Call);
    return;
  }
  case Intrinsics::Memset: {
    // The value operand needs to be extended to a stack slot size
    // because the PNaCl ABI requires arguments to be at least 32 bits
    // wide.
    Operand *ValOp = Instr->getArg(1);
    assert(ValOp->getType() == IceType_i8);
    Variable *ValExt = Func->makeVariable(stackSlotType());
    lowerCast(InstCast::create(Func, InstCast::Zext, ValExt, ValOp));
    InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3);
    Call->addArg(Instr->getArg(0));
    Call->addArg(ValExt);
    Call->addArg(Instr->getArg(2));
    lowerCall(Call);
    return;
  }
  case Intrinsics::NaClReadTP: {
    if (Ctx->getFlags().getUseSandboxing()) {
      Constant *Zero = Ctx->getConstantZero(IceType_i32);
      Operand *Src =
          OperandX8632Mem::create(Func, IceType_i32, nullptr, Zero, nullptr, 0,
                                  OperandX8632Mem::SegReg_GS);
      Variable *Dest = Instr->getDest();
      Variable *T = nullptr;
      _mov(T, Src);
      _mov(Dest, T);
    } else {
      InstCall *Call = makeHelperCall(H_call_read_tp, Instr->getDest(), 0);
      lowerCall(Call);
    }
    return;
  }
  case Intrinsics::Setjmp: {
    InstCall *Call = makeHelperCall(H_call_setjmp, Instr->getDest(), 1);
    Call->addArg(Instr->getArg(0));
    lowerCall(Call);
    return;
  }
  case Intrinsics::Sqrt: {
    Operand *Src = legalize(Instr->getArg(0));
    Variable *Dest = Instr->getDest();
    Variable *T = makeReg(Dest->getType());
    _sqrtss(T, Src);
    _mov(Dest, T);
    return;
  }
  case Intrinsics::Stacksave: {
    Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
    Variable *Dest = Instr->getDest();
    _mov(Dest, esp);
    return;
  }
  case Intrinsics::Stackrestore: {
    Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
    _mov_nonkillable(esp, Instr->getArg(0));
    return;
  }
  case Intrinsics::Trap:
    _ud2();
    return;
  case Intrinsics::UnknownIntrinsic:
    Func->setError("Should not be lowering UnknownIntrinsic");
    return;
  }
  return;
}

void TargetX8632::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
                                     Operand *Expected, Operand *Desired) {
  if (Expected->getType() == IceType_i64) {
    // Reserve the pre-colored registers first, before adding any more
    // infinite-weight variables from formMemoryOperand's legalization.
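    //
    // For reference, cmpxchg8b's register contract: edx:eax holds the
    // expected value and receives the memory's old value, ecx:ebx holds
    // the desired value, and ZF reports whether the swap happened.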
    Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
    Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
    Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
    Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
    _mov(T_eax, loOperand(Expected));
    _mov(T_edx, hiOperand(Expected));
    _mov(T_ebx, loOperand(Desired));
    _mov(T_ecx, hiOperand(Desired));
    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Expected->getType());
    const bool Locked = true;
    _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
    Variable *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
    Variable *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
    _mov(DestLo, T_eax);
    _mov(DestHi, T_edx);
    return;
  }
  Variable *T_eax = makeReg(Expected->getType(), RegX8632::Reg_eax);
  _mov(T_eax, Expected);
  OperandX8632Mem *Addr = formMemoryOperand(Ptr, Expected->getType());
  Variable *DesiredReg = legalizeToVar(Desired);
  const bool Locked = true;
  _cmpxchg(Addr, T_eax, DesiredReg, Locked);
  _mov(DestPrev, T_eax);
}

bool TargetX8632::tryOptimizedCmpxchgCmpBr(Variable *Dest, Operand *PtrToMem,
                                           Operand *Expected,
                                           Operand *Desired) {
  if (Ctx->getFlags().getOptLevel() == Opt_m1)
    return false;
  // Peek ahead a few instructions and see how Dest is used.
  // It's very common to have:
  //
  // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
  // [%y_phi = ...] // list of phi stores
  // %p = icmp eq i32 %x, %expected
  // br i1 %p, label %l1, label %l2
  //
  // which we can optimize into:
  //
  // %x = <cmpxchg code>
  // [%y_phi = ...] // list of phi stores
  // br eq, %x, %expected, label %l1, label %l2
  InstList::iterator I = Context.getCur();
  // I is currently the InstIntrinsicCall. Peek past that.
  // This assumes that the atomic cmpxchg has not been lowered yet,
  // so that the instructions seen in the scan from "Cur" are simple.
  assert(llvm::isa<InstIntrinsicCall>(*I));
  Inst *NextInst = Context.getNextInst(I);
  if (!NextInst)
    return false;
  // There might be phi assignments right before the compare+branch, since this
  // could be a backward branch for a loop. This placement of assignments is
  // determined by placePhiStores().
  std::vector<InstAssign *> PhiAssigns;
  while (InstAssign *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
    if (PhiAssign->getDest() == Dest)
      return false;
    PhiAssigns.push_back(PhiAssign);
    NextInst = Context.getNextInst(I);
    if (!NextInst)
      return false;
  }
  if (InstIcmp *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
    if (!(NextCmp->getCondition() == InstIcmp::Eq &&
          ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
           (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
      return false;
    }
    NextInst = Context.getNextInst(I);
    if (!NextInst)
      return false;
    if (InstBr *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
      if (!NextBr->isUnconditional() &&
          NextCmp->getDest() == NextBr->getCondition() &&
          NextBr->isLastUse(NextCmp->getDest())) {
        lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
        for (size_t i = 0; i < PhiAssigns.size(); ++i) {
          // Lower the phi assignments now, before the branch (same placement
          // as before).
          InstAssign *PhiAssign = PhiAssigns[i];
          PhiAssign->setDeleted();
          lowerAssign(PhiAssign);
          Context.advanceNext();
        }
        _br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
        // Skip over the old compare and branch, by deleting them.
        NextCmp->setDeleted();
        NextBr->setDeleted();
        Context.advanceNext();
        Context.advanceNext();
        return true;
      }
    }
  }
  return false;
}

void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
                                 Operand *Ptr, Operand *Val) {
  bool NeedsCmpxchg = false;
  LowerBinOp Op_Lo = nullptr;
  LowerBinOp Op_Hi = nullptr;
  switch (Operation) {
  default:
    Func->setError("Unknown AtomicRMW operation");
    return;
  case Intrinsics::AtomicAdd: {
    if (Dest->getType() == IceType_i64) {
      // All the fall-through paths must set this to true, but use this
      // for asserting.
      NeedsCmpxchg = true;
      Op_Lo = &TargetX8632::_add;
      Op_Hi = &TargetX8632::_adc;
      break;
    }
    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
    const bool Locked = true;
    Variable *T = nullptr;
    _mov(T, Val);
    _xadd(Addr, T, Locked);
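    // For reference: lock xadd [Addr], T atomically performs
    // tmp = [Addr]; [Addr] += T; T = tmp, so T ends up holding the value
    // memory had before the add.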
    _mov(Dest, T);
    return;
  }
  case Intrinsics::AtomicSub: {
    if (Dest->getType() == IceType_i64) {
      NeedsCmpxchg = true;
      Op_Lo = &TargetX8632::_sub;
      Op_Hi = &TargetX8632::_sbb;
      break;
    }
    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
    const bool Locked = true;
    Variable *T = nullptr;
    _mov(T, Val);
    _neg(T);
    _xadd(Addr, T, Locked);
    _mov(Dest, T);
    return;
  }
  case Intrinsics::AtomicOr:
    // TODO(jvoung): If Dest is null or dead, then some of these
    // operations do not need an "exchange", but just a locked op.
    // That appears to be "worth" it for sub, or, and, and xor.
    // xadd is probably fine vs lock add for add, and xchg is fine
    // vs an atomic store.
    NeedsCmpxchg = true;
    Op_Lo = &TargetX8632::_or;
    Op_Hi = &TargetX8632::_or;
    break;
  case Intrinsics::AtomicAnd:
    NeedsCmpxchg = true;
    Op_Lo = &TargetX8632::_and;
    Op_Hi = &TargetX8632::_and;
    break;
  case Intrinsics::AtomicXor:
    NeedsCmpxchg = true;
    Op_Lo = &TargetX8632::_xor;
    Op_Hi = &TargetX8632::_xor;
    break;
  case Intrinsics::AtomicExchange:
    if (Dest->getType() == IceType_i64) {
      NeedsCmpxchg = true;
      // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
      // just need to be moved to the ecx and ebx registers.
      Op_Lo = nullptr;
      Op_Hi = nullptr;
      break;
    }
    OperandX8632Mem *Addr = formMemoryOperand(Ptr, Dest->getType());
    Variable *T = nullptr;
    _mov(T, Val);
    _xchg(Addr, T);
    _mov(Dest, T);
    return;
  }
  // Otherwise, we need a cmpxchg loop.
  assert(NeedsCmpxchg);
  expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
}

void TargetX8632::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
                                           Variable *Dest, Operand *Ptr,
                                           Operand *Val) {
  // Expand a more complex RMW operation as a cmpxchg loop:
  // For 64-bit:
  //   mov     eax, [ptr]
  //   mov     edx, [ptr + 4]
  // .LABEL:
  //   mov     ebx, eax
  //   <Op_Lo> ebx, <desired_adj_lo>
  //   mov     ecx, edx
  //   <Op_Hi> ecx, <desired_adj_hi>
  //   lock cmpxchg8b [ptr]
  //   jne     .LABEL
  //   mov     <dest_lo>, eax
  //   mov     <dest_hi>, edx
  //
  // For 32-bit:
  //   mov     eax, [ptr]
  // .LABEL:
  //   mov     <reg>, eax
  //   op      <reg>, [desired_adj]
  //   lock cmpxchg [ptr], <reg>
  //   jne     .LABEL
  //   mov     <dest>, eax
  //
  // If Op_{Lo,Hi} are nullptr, then just copy the value.
3740 Val = legalize(Val);
3741 Type Ty = Val->getType();
3742 if (Ty == IceType_i64) {
3743 Variable *T_edx = makeReg(IceType_i32, RegX8632::Reg_edx);
3744 Variable *T_eax = makeReg(IceType_i32, RegX8632::Reg_eax);
3745 OperandX8632Mem *Addr = formMemoryOperand(Ptr, Ty);
3746 _mov(T_eax, loOperand(Addr));
3747 _mov(T_edx, hiOperand(Addr));
3748 Variable *T_ecx = makeReg(IceType_i32, RegX8632::Reg_ecx);
3749 Variable *T_ebx = makeReg(IceType_i32, RegX8632::Reg_ebx);
3750 InstX8632Label *Label = InstX8632Label::create(Func, this);
3751 const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
3753 Context.insert(Label);
3755 (this->*Op_Lo)(T_ebx, loOperand(Val));
3757 (this->*Op_Hi)(T_ecx, hiOperand(Val));
3759 // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
3760 // It just needs the Val loaded into ebx and ecx.
3761 // That can also be done before the loop.
3762 _mov(T_ebx, loOperand(Val));
3763 _mov(T_ecx, hiOperand(Val));
3764 Context.insert(Label);
3766 const bool Locked = true;
3767 _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
3768 _br(CondX86::Br_ne, Label);
3770 // If Val is a variable, model the extended live range of Val through
3771 // the end of the loop, since it will be re-used by the loop.
3772 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
3773 Variable *ValLo = llvm::cast<Variable>(loOperand(ValVar));
3774 Variable *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
3775 Context.insert(InstFakeUse::create(Func, ValLo));
3776 Context.insert(InstFakeUse::create(Func, ValHi));
3779 // For xchg, the loop is slightly smaller and ebx/ecx are used.
3780 Context.insert(InstFakeUse::create(Func, T_ebx));
3781 Context.insert(InstFakeUse::create(Func, T_ecx));
3783 // The address base (if any) is also reused in the loop.
3784 if (Variable *Base = Addr->getBase())
3785 Context.insert(InstFakeUse::create(Func, Base));
3786 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
3787 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
3788 _mov(DestLo, T_eax);
3789 _mov(DestHi, T_edx);
3790 return;
3791 }
3792 OperandX8632Mem *Addr = formMemoryOperand(Ptr, Ty);
3793 Variable *T_eax = makeReg(Ty, RegX8632::Reg_eax);
3794 _mov(T_eax, Addr);
3795 InstX8632Label *Label = InstX8632Label::create(Func, this);
3796 Context.insert(Label);
3797 // We want to pick a different register for T than eax, so don't use
3798 // _mov(T == nullptr, T_eax), which could leave T in eax.
3799 Variable *T = makeReg(Ty);
3800 _mov(T, T_eax);
3801 (this->*Op_Lo)(T, Val);
3802 const bool Locked = true;
3803 _cmpxchg(Addr, T_eax, T, Locked);
3804 _br(CondX86::Br_ne, Label);
3805 // If Val is a variable, model the extended live range of Val through
3806 // the end of the loop, since it will be re-used by the loop.
3807 if (Variable *ValVar = llvm::dyn_cast<Variable>(Val)) {
3808 Context.insert(InstFakeUse::create(Func, ValVar));
3809 }
3810 // The address base (if any) is also reused in the loop.
3811 if (Variable *Base = Addr->getBase())
3812 Context.insert(InstFakeUse::create(Func, Base));
3813 _mov(Dest, T_eax);
3814 }
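// (Note on the expansion above: like GCC-style __sync primitives, the
// atomic RMW returns the location's pre-operation value. That is exactly
// what cmpxchg leaves in eax (edx:eax for cmpxchg8b) once the loop
// finally succeeds, so it is moved into Dest on exit.)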
3816 // Lowers count {trailing, leading} zeros intrinsic.
3818 // We could do constant folding here, but that should have
3819 // been done by the front-end/middle-end optimizations.
3820 void TargetX8632::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
3821 Operand *FirstVal, Operand *SecondVal) {
3822 // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
3823 // Then the instructions will handle the Val == 0 case much more simply
3824 // and won't require conversion from bit position to number of zeros.
3825 //
3826 // Otherwise:
3827 // bsr IF_NOT_ZERO, Val
3828 // mov T_DEST, 63
3829 // cmovne T_DEST, IF_NOT_ZERO
3830 // xor T_DEST, 31
3833 // NOTE: T_DEST must be a register because cmov requires its dest to be a
3834 // register. Also, bsf and bsr require their dest to be a register.
3836 // The xor DEST, 31 converts a bit position to # of leading zeroes.
3837 // E.g., for 000...00001100, bsr will say that the most significant bit
3838 // set is at position 3, while the number of leading zeros is 28. Xor is
3839 // like (31 - N) for N <= 31, and converts 63 to 32 (for the all-zeros case).
3841 // Similar for 64-bit, but start with speculating that the upper 32 bits
3842 // are all zero, and compute the result for that case (checking the lower
3843 // 32 bits). Then actually compute the result for the upper bits and
3844 // cmov in the result from the lower computation if the earlier speculation
3845 // was correct.
3846 //
3847 // Cttz is similar, but uses bsf instead, and doesn't require the xor
3848 // bit position conversion, and the speculation is reversed.
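//
// A worked example of the 64-bit ctlz path sketched above: for the value
// 0x0000000000001100, the upper word is speculated to be zero; bsr on the
// lower word gives bit position 12, xor 31 yields 19, and adding 32 gives
// 51. The test on the upper word then confirms it really is zero, so the
// cmov keeps 51, which is indeed ctlz of that 64-bit value.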
3849 assert(Ty == IceType_i32 || Ty == IceType_i64);
3850 Variable *T = makeReg(IceType_i32);
3851 Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
3852 if (Cttz) {
3853 _bsf(T, FirstValRM);
3854 } else {
3855 _bsr(T, FirstValRM);
3856 }
3857 Variable *T_Dest = makeReg(IceType_i32);
3858 Constant *ThirtyTwo = Ctx->getConstantInt32(32);
3859 Constant *ThirtyOne = Ctx->getConstantInt32(31);
3860 if (Cttz) {
3861 _mov(T_Dest, ThirtyTwo);
3862 } else {
3863 Constant *SixtyThree = Ctx->getConstantInt32(63);
3864 _mov(T_Dest, SixtyThree);
3865 }
3866 _cmov(T_Dest, T, CondX86::Br_ne);
3867 if (!Cttz) {
3868 _xor(T_Dest, ThirtyOne);
3869 }
3870 if (Ty == IceType_i32) {
3871 _mov(Dest, T_Dest);
3872 return;
3873 }
3874 _add(T_Dest, ThirtyTwo);
3875 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
3876 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
3877 // Will be using "test" on this, so we need a registerized variable.
3878 Variable *SecondVar = legalizeToVar(SecondVal);
3879 Variable *T_Dest2 = makeReg(IceType_i32);
3880 if (Cttz) {
3881 _bsf(T_Dest2, SecondVar);
3882 } else {
3883 _bsr(T_Dest2, SecondVar);
3884 _xor(T_Dest2, ThirtyOne);
3885 }
3886 _test(SecondVar, SecondVar);
3887 _cmov(T_Dest2, T_Dest, CondX86::Br_e);
3888 _mov(DestLo, T_Dest2);
3889 _mov(DestHi, Ctx->getConstantZero(IceType_i32));
3890 }
3892 namespace {
3894 bool isAdd(const Inst *Inst) {
3895 if (const InstArithmetic *Arith =
3896 llvm::dyn_cast_or_null<const InstArithmetic>(Inst)) {
3897 return (Arith->getOp() == InstArithmetic::Add);
3898 }
3899 return false;
3900 }
3902 void dumpAddressOpt(const Cfg *Func, const Variable *Base,
3903 const Variable *Index, uint16_t Shift, int32_t Offset,
3904 const Inst *Reason) {
3907 if (!Func->isVerbose(IceV_AddrOpt))
3908 return;
3909 OstreamLocker L(Func->getContext());
3910 Ostream &Str = Func->getContext()->getStrDump();
3911 Str << "Instruction: ";
3912 Reason->dumpDecorated(Func);
3913 Str << " results in Base=";
3914 if (Base == nullptr)
3915 Str << "<null>";
3916 else
3917 Base->dump(Func);
3918 Str << ", Index=";
3919 if (Index == nullptr)
3920 Str << "<null>";
3921 else
3922 Index->dump(Func);
3923 Str << ", Shift=" << Shift << ", Offset=" << Offset << "\n";
3926 bool matchTransitiveAssign(const VariablesMetadata *VMetadata, Variable *&Var,
3927 const Inst *&Reason) {
3928 // Var originates from Var=SrcVar ==>
3929 // set Var:=SrcVar
3930 if (Var == nullptr)
3931 return false;
3932 if (const Inst *VarAssign = VMetadata->getSingleDefinition(Var)) {
3933 assert(!VMetadata->isMultiDef(Var));
3934 if (llvm::isa<InstAssign>(VarAssign)) {
3935 Operand *SrcOp = VarAssign->getSrc(0);
3937 if (Variable *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
3938 if (!VMetadata->isMultiDef(SrcVar) &&
3939 // TODO: ensure SrcVar stays single-BB
3940 true) {
3941 Var = SrcVar;
3942 Reason = VarAssign;
3943 return true;
3944 }
3945 }
3946 }
3947 }
3948 return false;
3949 }
3951 bool matchCombinedBaseIndex(const VariablesMetadata *VMetadata, Variable *&Base,
3952 Variable *&Index, uint16_t &Shift,
3953 const Inst *&Reason) {
3954 // Index==nullptr && Base is Base=Var1+Var2 ==>
3955 // set Base=Var1, Index=Var2, Shift=0
3956 if (Base == nullptr)
3957 return false;
3958 if (Index != nullptr)
3959 return false;
3960 const Inst *BaseInst = VMetadata->getSingleDefinition(Base);
3961 if (BaseInst == nullptr)
3962 return false;
3963 assert(!VMetadata->isMultiDef(Base));
3964 if (BaseInst->getSrcSize() < 2)
3965 return false;
3966 if (Variable *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
3967 if (VMetadata->isMultiDef(Var1))
3968 return false;
3969 if (Variable *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
3970 if (VMetadata->isMultiDef(Var2))
3971 return false;
3972 if (isAdd(BaseInst) &&
3973 // TODO: ensure Var1 and Var2 stay single-BB
3974 true) {
3975 Base = Var1;
3976 Index = Var2;
3977 Shift = 0; // should already have been 0
3978 Reason = BaseInst;
3979 return true;
3980 }
3981 }
3982 }
3983 return false;
3984 }
3986 bool matchShiftedIndex(const VariablesMetadata *VMetadata, Variable *&Index,
3987 uint16_t &Shift, const Inst *&Reason) {
3988 // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
3989 // Index=Var, Shift+=log2(Const)
3990 if (Index == nullptr)
3991 return false;
3992 const Inst *IndexInst = VMetadata->getSingleDefinition(Index);
3993 if (IndexInst == nullptr)
3994 return false;
3995 assert(!VMetadata->isMultiDef(Index));
3996 if (IndexInst->getSrcSize() < 2)
3997 return false;
3998 if (const InstArithmetic *ArithInst =
3999 llvm::dyn_cast<InstArithmetic>(IndexInst)) {
4000 if (Variable *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
4001 if (ConstantInteger32 *Const =
4002 llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
4003 if (ArithInst->getOp() == InstArithmetic::Mul &&
4004 !VMetadata->isMultiDef(Var) && Const->getType() == IceType_i32) {
4005 uint64_t Mult = Const->getValue();
4006 uint32_t LogMult;
4007 switch (Mult) {
4008 case 1: LogMult = 0; break;
4009 case 2: LogMult = 1; break;
4010 case 4: LogMult = 2; break;
4011 case 8: LogMult = 3; break;
4012 default: return false; // only *1, *2, *4, *8 fit an x86 scale
4013 }
4023 if (Shift + LogMult <= 3) {
4024 Index = Var;
4025 Shift += LogMult;
4026 Reason = IndexInst;
4027 return true;
4028 }
4029 }
4030 }
4031 }
4032 }
4033 return false;
4034 }
4036 bool matchOffsetBase(const VariablesMetadata *VMetadata, Variable *&Base,
4037 int32_t &Offset, const Inst *&Reason) {
4038 // Base is Base=Var+Const || Base is Base=Const+Var ==>
4039 // set Base=Var, Offset+=Const
4040 // Base is Base=Var-Const ==>
4041 // set Base=Var, Offset-=Const
4042 if (Base == nullptr)
4043 return false;
4044 const Inst *BaseInst = VMetadata->getSingleDefinition(Base);
4045 if (BaseInst == nullptr)
4046 return false;
4047 assert(!VMetadata->isMultiDef(Base));
4048 if (const InstArithmetic *ArithInst =
4049 llvm::dyn_cast<const InstArithmetic>(BaseInst)) {
4050 if (ArithInst->getOp() != InstArithmetic::Add &&
4051 ArithInst->getOp() != InstArithmetic::Sub)
4052 return false;
4053 bool IsAdd = ArithInst->getOp() == InstArithmetic::Add;
4054 Variable *Var = nullptr;
4055 ConstantInteger32 *Const = nullptr;
4056 if (Variable *VariableOperand =
4057 llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
4058 Var = VariableOperand;
4059 Const = llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1));
4060 } else {
4061 Const = llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(0));
4062 Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(1));
4063 }
4064 if (Var == nullptr || Const == nullptr || VMetadata->isMultiDef(Var))
4066 int32_t MoreOffset = IsAdd ? Const->getValue() : -Const->getValue();
4067 if (Utils::WouldOverflowAdd(Offset, MoreOffset))
4068 return false;
4069 Base = Var;
4070 Offset += MoreOffset;
4071 Reason = BaseInst;
4072 return true;
4073 }
4074 return false;
4075 }
4077 void computeAddressOpt(Cfg *Func, const Inst *Instr, Variable *&Base,
4078 Variable *&Index, uint16_t &Shift, int32_t &Offset) {
4079 Func->resetCurrentNode();
4080 if (Func->isVerbose(IceV_AddrOpt)) {
4081 OstreamLocker L(Func->getContext());
4082 Ostream &Str = Func->getContext()->getStrDump();
4083 Str << "\nStarting computeAddressOpt for instruction:\n ";
4084 Instr->dumpDecorated(Func);
4086 (void)Offset; // TODO: pattern-match for non-zero offsets.
4087 if (Base == nullptr)
4088 return;
4089 // If the Base has more than one use or is live across multiple
4090 // blocks, then don't go further. Alternatively (?), never consider
4091 // a transformation that would change a variable that is currently
4092 // *not* live across basic block boundaries into one that *is*.
4093 if (Func->getVMetadata()->isMultiBlock(Base) /* || Base->getUseCount() > 1*/)
4094 return;
4096 const VariablesMetadata *VMetadata = Func->getVMetadata();
4097 bool Continue = true;
4098 while (Continue) {
4099 const Inst *Reason = nullptr;
4100 if (matchTransitiveAssign(VMetadata, Base, Reason) ||
4101 matchTransitiveAssign(VMetadata, Index, Reason) ||
4102 matchCombinedBaseIndex(VMetadata, Base, Index, Shift, Reason) ||
4103 matchShiftedIndex(VMetadata, Index, Shift, Reason) ||
4104 matchOffsetBase(VMetadata, Base, Offset, Reason)) {
4105 dumpAddressOpt(Func, Base, Index, Shift, Offset, Reason);
4106 } else {
4107 Continue = false;
4108 }
4109 }
4110 // Index is Index=Var<<Const && Const+Shift<=3 ==>
4111 // Index=Var, Shift+=Const
4113 // Index is Index=Const*Var && log2(Const)+Shift<=3 ==>
4114 // Index=Var, Shift+=log2(Const)
4116 // Index && Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
4117 // swap(Index,Base)
4118 // Similar for Base=Const*Var and Base=Var<<Const
4120 // Index is Index=Var+Const ==>
4121 // set Index=Var, Offset+=(Const<<Shift)
4123 // Index is Index=Const+Var ==>
4124 // set Index=Var, Offset+=(Const<<Shift)
4126 // Index is Index=Var-Const ==>
4127 // set Index=Var, Offset-=(Const<<Shift)
4129 // TODO: consider overflow issues with respect to Offset.
4130 // TODO: handle symbolic constants.
4131 }
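// To illustrate how the matchers compose (variable names here are made
// up), starting from the chain
//   t1 = a + 16; t2 = i * 4; t3 = t1 + t2; load t3
// repeated matching folds everything into the single x86 address mode
// [a + i*4 + 16]: matchShiftedIndex turns i*4 into Index=i with Shift=2,
// matchCombinedBaseIndex pairs the Base with that Index, and
// matchOffsetBase peels the +16 into Offset.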
4134 } // anonymous namespace
4136 void TargetX8632::lowerLoad(const InstLoad *Load) {
4137 // A Load instruction can be treated the same as an Assign
4138 // instruction, after the source operand is transformed into an
4139 // OperandX8632Mem operand. Note that the address mode
4140 // optimization already creates an OperandX8632Mem operand, so it
4141 // doesn't need another level of transformation.
4142 Variable *DestLoad = Load->getDest();
4143 Type Ty = DestLoad->getType();
4144 Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
4145 InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
4146 lowerAssign(Assign);
4149 void TargetX8632::doAddressOptLoad() {
4150 Inst *Inst = Context.getCur();
4151 Variable *Dest = Inst->getDest();
4152 Operand *Addr = Inst->getSrc(0);
4153 Variable *Index = nullptr;
4154 uint16_t Shift = 0;
4155 int32_t Offset = 0; // TODO: make Constant
4156 // Vanilla ICE load instructions should not use the segment registers,
4157 // and computeAddressOpt only works at the level of Variables and Constants,
4158 // not other OperandX8632Mem, so there should be no mention of segment
4159 // registers there either.
4160 const OperandX8632Mem::SegmentRegisters SegmentReg =
4161 OperandX8632Mem::DefaultSegment;
4162 Variable *Base = llvm::dyn_cast<Variable>(Addr);
4163 computeAddressOpt(Func, Inst, Base, Index, Shift, Offset);
4164 if (Base && Addr != Base) {
4165 Inst->setDeleted();
4166 Constant *OffsetOp = Ctx->getConstantInt32(Offset);
4167 Addr = OperandX8632Mem::create(Func, Dest->getType(), Base, OffsetOp, Index,
4168 Shift, SegmentReg);
4169 Context.insert(InstLoad::create(Func, Dest, Addr));
4170 }
4171 }
4173 void TargetX8632::randomlyInsertNop(float Probability) {
4174 RandomNumberGeneratorWrapper RNG(Ctx->getRNG());
4175 if (RNG.getTrueWithProbability(Probability)) {
4176 _nop(RNG(X86_NUM_NOP_VARIANTS));
4180 void TargetX8632::lowerPhi(const InstPhi * /*Inst*/) {
4181 Func->setError("Phi found in regular instruction list");
4184 void TargetX8632::lowerRet(const InstRet *Inst) {
4185 Variable *Reg = nullptr;
4186 if (Inst->hasRetValue()) {
4187 Operand *Src0 = legalize(Inst->getRetValue());
4188 if (Src0->getType() == IceType_i64) {
4189 Variable *eax = legalizeToVar(loOperand(Src0), RegX8632::Reg_eax);
4190 Variable *edx = legalizeToVar(hiOperand(Src0), RegX8632::Reg_edx);
4191 Reg = eax;
4192 Context.insert(InstFakeUse::create(Func, edx));
4193 } else if (isScalarFloatingType(Src0->getType())) {
4194 _fld(Src0);
4195 } else if (isVectorType(Src0->getType())) {
4196 Reg = legalizeToVar(Src0, RegX8632::Reg_xmm0);
4197 } else {
4198 _mov(Reg, Src0, RegX8632::Reg_eax);
4199 }
4200 }
4201 // Add a ret instruction even if sandboxing is enabled, because
4202 // addEpilog explicitly looks for a ret instruction as a marker for
4203 // where to insert the frame removal instructions.
4204 _ret(Reg);
4205 // Add a fake use of esp to make sure esp stays alive for the entire
4206 // function. Otherwise post-call esp adjustments get dead-code
4207 // eliminated. TODO: Are there more places where the fake use
4208 // should be inserted? E.g. "void f(int n){while(1) g(n);}" may not
4209 // have a ret instruction.
4210 Variable *esp = Func->getTarget()->getPhysicalRegister(RegX8632::Reg_esp);
4212 Context.insert(InstFakeUse::create(Func, esp));
4213 }
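// Recap of the return conventions used above: i32 values return in eax,
// i64 in edx:eax, vectors in xmm0, and scalar f32/f64 on the x87 stack
// top (hence the fld).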
4214 void TargetX8632::lowerSelect(const InstSelect *Inst) {
4215 Variable *Dest = Inst->getDest();
4216 Type DestTy = Dest->getType();
4217 Operand *SrcT = Inst->getTrueOperand();
4218 Operand *SrcF = Inst->getFalseOperand();
4219 Operand *Condition = Inst->getCondition();
4221 if (isVectorType(DestTy)) {
4222 Type SrcTy = SrcT->getType();
4223 Variable *T = makeReg(SrcTy);
4224 Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
4225 Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
4226 if (InstructionSet >= SSE4_1) {
4227 // TODO(wala): If the condition operand is a constant, use blendps
4228 // or pblendw.
4230 // Use blendvps or pblendvb to implement select.
4231 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
4232 SrcTy == IceType_v4f32) {
4233 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
4234 Variable *xmm0 = makeReg(IceType_v4i32, RegX8632::Reg_xmm0);
4235 _movp(xmm0, ConditionRM);
4236 _psll(xmm0, Ctx->getConstantInt8(31));
4237 _movp(T, SrcFRM);
4238 _blendvps(T, SrcTRM, xmm0);
4239 _movp(Dest, T);
4240 } else {
4241 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
4242 Type SignExtTy = Condition->getType() == IceType_v8i1 ? IceType_v8i16
4243 : IceType_v16i8;
4244 Variable *xmm0 = makeReg(SignExtTy, RegX8632::Reg_xmm0);
4245 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
4247 _pblendvb(T, SrcTRM, xmm0);
4248 _movp(Dest, T);
4249 }
4250 return;
4251 }
4252 // Lower select without SSE4.1:
4253 // a=d?b:c ==>
4254 // if elementtype(d) != i1:
4255 // d=sext(d);
4256 // a=(b&d)|(c&~d);
4257 Variable *T2 = makeReg(SrcTy);
4258 // Sign extend the condition operand if applicable.
4259 if (SrcTy == IceType_v4f32) {
4260 // The sext operation takes only integer arguments.
4261 Variable *T3 = Func->makeVariable(IceType_v4i32);
4262 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
4263 _movp(T, T3);
4264 } else if (typeElementType(SrcTy) != IceType_i1) {
4265 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
4266 } else {
4267 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
4268 _movp(T, ConditionRM);
4269 }
4270 _movp(T2, T);
4271 _pand(T, SrcTRM);
4272 _pandn(T2, SrcFRM);
4273 _por(T, T2);
4274 _movp(Dest, T);
4275 return;
4276 }
4279 CondX86::BrCond Cond = CondX86::Br_ne;
4280 Operand *CmpOpnd0 = nullptr;
4281 Operand *CmpOpnd1 = nullptr;
4282 // Handle folding opportunities.
4283 if (const class Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
4284 assert(Producer->isDeleted());
4285 switch (BoolFolding::getProducerKind(Producer)) {
4286 default:
4287 break;
4288 case BoolFolding::PK_Icmp32: {
4289 auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer);
4290 Cond = getIcmp32Mapping(Cmp->getCondition());
4291 CmpOpnd1 = legalize(Producer->getSrc(1));
4292 CmpOpnd0 = legalizeSrc0ForCmp(Producer->getSrc(0), CmpOpnd1);
4293 } break;
4294 }
4295 }
4296 if (CmpOpnd0 == nullptr) {
4297 CmpOpnd0 = legalize(Condition, Legal_Reg | Legal_Mem);
4298 CmpOpnd1 = Ctx->getConstantZero(IceType_i32);
4299 }
4303 _cmp(CmpOpnd0, CmpOpnd1);
4304 if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
4305 // The cmov instruction doesn't allow 8-bit or FP operands, so
4306 // we need explicit control flow.
4307 // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
4308 InstX8632Label *Label = InstX8632Label::create(Func, this);
4309 SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
4310 _mov(Dest, SrcT);
4311 _br(Cond, Label);
4312 SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
4313 _mov_nonkillable(Dest, SrcF);
4314 Context.insert(Label);
4315 return;
4316 }
4317 // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
4318 // But if SrcT is immediate, we might be able to do better, as
4319 // the cmov instruction doesn't allow an immediate operand:
4320 // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
4321 if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
4322 std::swap(SrcT, SrcF);
4323 Cond = InstX8632::getOppositeCondition(Cond);
4325 if (DestTy == IceType_i64) {
4326 // Set the low portion.
4327 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
4328 Variable *TLo = nullptr;
4329 Operand *SrcFLo = legalize(loOperand(SrcF));
4330 _mov(TLo, SrcFLo);
4331 Operand *SrcTLo = legalize(loOperand(SrcT), Legal_Reg | Legal_Mem);
4332 _cmov(TLo, SrcTLo, Cond);
4333 _mov(DestLo, TLo);
4334 // Set the high portion.
4335 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4336 Variable *THi = nullptr;
4337 Operand *SrcFHi = legalize(hiOperand(SrcF));
4338 _mov(THi, SrcFHi);
4339 Operand *SrcTHi = legalize(hiOperand(SrcT), Legal_Reg | Legal_Mem);
4340 _cmov(THi, SrcTHi, Cond);
4341 _mov(DestHi, THi);
4342 return;
4343 }
4345 assert(DestTy == IceType_i16 || DestTy == IceType_i32);
4346 Variable *T = nullptr;
4347 SrcF = legalize(SrcF);
4348 _mov(T, SrcF);
4349 SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
4350 _cmov(T, SrcT, Cond);
4351 _mov(Dest, T);
4352 }
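// For illustration, a = (x == y) ? b : c with i32 operands, after the
// icmp producer is folded, comes out roughly as (sketch only):
//   cmp   x, y
//   mov   t, c
//   cmove t, b
//   mov   a, t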
4354 void TargetX8632::lowerStore(const InstStore *Inst) {
4355 Operand *Value = Inst->getData();
4356 Operand *Addr = Inst->getAddr();
4357 OperandX8632Mem *NewAddr = formMemoryOperand(Addr, Value->getType());
4358 Type Ty = NewAddr->getType();
4360 if (Ty == IceType_i64) {
4361 Value = legalize(Value);
4362 Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
4363 Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
4364 _store(ValueHi, llvm::cast<OperandX8632Mem>(hiOperand(NewAddr)));
4365 _store(ValueLo, llvm::cast<OperandX8632Mem>(loOperand(NewAddr)));
4366 } else if (isVectorType(Ty)) {
4367 _storep(legalizeToVar(Value), NewAddr);
4369 Value = legalize(Value, Legal_Reg | Legal_Imm);
4370 _store(Value, NewAddr);
4371 }
4372 }
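// (Note: the split i64 store above produces two separate 32-bit movs,
// e.g. mov [addr+4], hi32 then mov [addr], lo32, which is not atomic;
// atomic 64-bit stores go through the atomics/intrinsics path instead.)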
4374 void TargetX8632::doAddressOptStore() {
4375 InstStore *Inst = llvm::cast<InstStore>(Context.getCur());
4376 Operand *Data = Inst->getData();
4377 Operand *Addr = Inst->getAddr();
4378 Variable *Index = nullptr;
4379 uint16_t Shift = 0;
4380 int32_t Offset = 0; // TODO: make Constant
4381 Variable *Base = llvm::dyn_cast<Variable>(Addr);
4382 // Vanilla ICE store instructions should not use the segment registers,
4383 // and computeAddressOpt only works at the level of Variables and Constants,
4384 // not other OperandX8632Mem, so there should be no mention of segment
4385 // registers there either.
4386 const OperandX8632Mem::SegmentRegisters SegmentReg =
4387 OperandX8632Mem::DefaultSegment;
4388 computeAddressOpt(Func, Inst, Base, Index, Shift, Offset);
4389 if (Base && Addr != Base) {
4390 Inst->setDeleted();
4391 Constant *OffsetOp = Ctx->getConstantInt32(Offset);
4392 Addr = OperandX8632Mem::create(Func, Data->getType(), Base, OffsetOp, Index,
4393 Shift, SegmentReg);
4394 Context.insert(InstStore::create(Func, Data, Addr));
4395 }
4396 }
4398 void TargetX8632::lowerSwitch(const InstSwitch *Inst) {
4399 // This implements the most naive possible lowering.
4400 // cmp a,val[0]; jeq label[0]; cmp a,val[1]; jeq label[1]; ... jmp default
4401 Operand *Src0 = Inst->getComparison();
4402 SizeT NumCases = Inst->getNumCases();
4403 if (Src0->getType() == IceType_i64) {
4404 Src0 = legalize(Src0); // get Base/Index into physical registers
4405 Operand *Src0Lo = loOperand(Src0);
4406 Operand *Src0Hi = hiOperand(Src0);
4407 if (NumCases >= 2) {
4408 Src0Lo = legalizeToVar(Src0Lo);
4409 Src0Hi = legalizeToVar(Src0Hi);
4410 } else {
4411 Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
4412 Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
4413 }
4414 for (SizeT I = 0; I < NumCases; ++I) {
4415 Constant *ValueLo = Ctx->getConstantInt32(Inst->getValue(I));
4416 Constant *ValueHi = Ctx->getConstantInt32(Inst->getValue(I) >> 32);
4417 InstX8632Label *Label = InstX8632Label::create(Func, this);
4418 _cmp(Src0Lo, ValueLo);
4419 _br(CondX86::Br_ne, Label);
4420 _cmp(Src0Hi, ValueHi);
4421 _br(CondX86::Br_e, Inst->getLabel(I));
4422 Context.insert(Label);
4423 }
4424 _br(Inst->getLabelDefault());
4425 return;
4426 }
4427 // OK, we'll be slightly less naive by forcing Src into a physical
4428 // register if there are 2 or more uses.
4429 if (NumCases >= 2)
4430 Src0 = legalizeToVar(Src0);
4431 else
4432 Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
4433 for (SizeT I = 0; I < NumCases; ++I) {
4434 Constant *Value = Ctx->getConstantInt32(Inst->getValue(I));
4435 _cmp(Src0, Value);
4436 _br(CondX86::Br_e, Inst->getLabel(I));
4437 }
4439 _br(Inst->getLabelDefault());
4440 }
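// For illustration, a switch on i32 a with cases 0 and 10 lowers to the
// naive sequence (label names invented for the sketch):
//   cmp a, 0
//   je  case0
//   cmp a, 10
//   je  case10
//   jmp default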
4442 void TargetX8632::scalarizeArithmetic(InstArithmetic::OpKind Kind,
4443 Variable *Dest, Operand *Src0,
4444 Operand *Src1) {
4445 assert(isVectorType(Dest->getType()));
4446 Type Ty = Dest->getType();
4447 Type ElementTy = typeElementType(Ty);
4448 SizeT NumElements = typeNumElements(Ty);
4450 Operand *T = Ctx->getConstantUndef(Ty);
4451 for (SizeT I = 0; I < NumElements; ++I) {
4452 Constant *Index = Ctx->getConstantInt32(I);
4454 // Extract the next two inputs.
4455 Variable *Op0 = Func->makeVariable(ElementTy);
4456 lowerExtractElement(InstExtractElement::create(Func, Op0, Src0, Index));
4457 Variable *Op1 = Func->makeVariable(ElementTy);
4458 lowerExtractElement(InstExtractElement::create(Func, Op1, Src1, Index));
4460 // Perform the arithmetic as a scalar operation.
4461 Variable *Res = Func->makeVariable(ElementTy);
4462 lowerArithmetic(InstArithmetic::create(Func, Kind, Res, Op0, Op1));
4464 // Insert the result into position.
4465 Variable *DestT = Func->makeVariable(Ty);
4466 lowerInsertElement(InstInsertElement::create(Func, DestT, T, Res, Index));
4467 T = DestT;
4468 }
4470 lowerAssign(InstAssign::create(Func, Dest, T));
4471 }
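// For example, an sdiv on v4i32 (which has no SSE2 instruction) is
// scalarized by this routine into four extractelement / scalar sdiv /
// insertelement triples, with T threading the partially built result
// vector through each iteration.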
4473 // The following pattern occurs often in lowered C and C++ code:
4475 // %cmp = fcmp/icmp pred <n x ty> %src0, %src1
4476 // %cmp.ext = sext <n x i1> %cmp to <n x ty>
4478 // We can eliminate the sext operation by copying the result of pcmpeqd,
4479 // pcmpgtd, or cmpps (which produce sign extended results) to the result
4480 // of the sext operation.
4481 void TargetX8632::eliminateNextVectorSextInstruction(
4482 Variable *SignExtendedResult) {
4483 if (InstCast *NextCast =
4484 llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
4485 if (NextCast->getCastKind() == InstCast::Sext &&
4486 NextCast->getSrc(0) == SignExtendedResult) {
4487 NextCast->setDeleted();
4488 _movp(NextCast->getDest(), legalizeToVar(SignExtendedResult));
4489 // Skip over the instruction.
4490 Context.advanceNext();
4495 void TargetX8632::lowerUnreachable(const InstUnreachable * /*Inst*/) { _ud2(); }
4497 // Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
4498 // preserve integrity of liveness analysis. Undef values are also
4499 // turned into zeroes, since loOperand() and hiOperand() don't expect
4500 // to see undef.
4501 void TargetX8632::prelowerPhis() {
4502 CfgNode *Node = Context.getNode();
4503 for (Inst &I : Node->getPhis()) {
4504 auto Phi = llvm::dyn_cast<InstPhi>(&I);
4505 if (Phi->isDeleted())
4506 continue;
4507 Variable *Dest = Phi->getDest();
4508 if (Dest->getType() == IceType_i64) {
4509 Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
4510 Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4511 InstPhi *PhiLo = InstPhi::create(Func, Phi->getSrcSize(), DestLo);
4512 InstPhi *PhiHi = InstPhi::create(Func, Phi->getSrcSize(), DestHi);
4513 for (SizeT I = 0; I < Phi->getSrcSize(); ++I) {
4514 Operand *Src = Phi->getSrc(I);
4515 CfgNode *Label = Phi->getLabel(I);
4516 if (llvm::isa<ConstantUndef>(Src))
4517 Src = Ctx->getConstantZero(Dest->getType());
4518 PhiLo->addArgument(loOperand(Src), Label);
4519 PhiHi->addArgument(hiOperand(Src), Label);
4520 }
4521 Node->getPhis().push_back(PhiLo);
4522 Node->getPhis().push_back(PhiHi);
4523 Phi->setDeleted();
4524 }
4525 }
4526 }
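// For example (names illustrative), %x = phi i64 [%a, %B1], [%b, %B2]
// becomes the pair
//   %x.lo = phi i32 [%a.lo, %B1], [%b.lo, %B2]
//   %x.hi = phi i32 [%a.hi, %B1], [%b.hi, %B2]
// where the .lo/.hi halves come from loOperand()/hiOperand().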
4530 bool isMemoryOperand(const Operand *Opnd) {
4531 if (const auto Var = llvm::dyn_cast<Variable>(Opnd))
4532 return !Var->hasReg();
4533 // We treat vector undef values the same as a memory operand,
4534 // because they do in fact need a register to materialize the vector
4535 // of zeroes.
4536 if (llvm::isa<ConstantUndef>(Opnd))
4537 return isScalarFloatingType(Opnd->getType()) ||
4538 isVectorType(Opnd->getType());
4539 if (llvm::isa<Constant>(Opnd))
4540 return isScalarFloatingType(Opnd->getType());
4541 return false;
4542 }
4544 } // end of anonymous namespace
4546 // Lower the pre-ordered list of assignments into mov instructions.
4547 // Also has to do some ad-hoc register allocation as necessary.
4548 void TargetX8632::lowerPhiAssignments(CfgNode *Node,
4549 const AssignList &Assignments) {
4550 // Check that this is a properly initialized shell of a node.
4551 assert(Node->getOutEdges().size() == 1);
4552 assert(Node->getInsts().empty());
4553 assert(Node->getPhis().empty());
4554 CfgNode *Succ = Node->getOutEdges().front();
4555 getContext().init(Node);
4556 // Register set setup similar to regAlloc().
4557 RegSetMask RegInclude = RegSet_All;
4558 RegSetMask RegExclude = RegSet_StackPointer;
4559 if (hasFramePointer())
4560 RegExclude |= RegSet_FramePointer;
4561 llvm::SmallBitVector Available = getRegisterSet(RegInclude, RegExclude);
4562 bool NeedsRegs = false;
4563 // Initialize the set of available registers to the set of what is
4564 // available (not live) at the beginning of the successor block,
4565 // minus all registers used as Dest operands in the Assignments. To
4566 // do this, we start off assuming all registers are available, then
4567 // iterate through the Assignments and remove Dest registers.
4568 // During this iteration, we also determine whether we will actually
4569 // need any extra registers for memory-to-memory copies. If so, we
4570 // do the actual work of removing the live-in registers from the
4571 // set. TODO(stichnot): This work is being repeated for every split
4572 // edge to the successor, so consider updating LiveIn just once
4573 // after all the edges are split.
4574 for (const Inst &I : Assignments) {
4575 Variable *Dest = I.getDest();
4576 if (Dest->hasReg()) {
4577 Available[Dest->getRegNum()] = false;
4578 } else if (isMemoryOperand(I.getSrc(0))) {
4579 NeedsRegs = true; // Src and Dest are both in memory
4580 }
4581 }
4582 if (NeedsRegs) {
4583 LivenessBV &LiveIn = Func->getLiveness()->getLiveIn(Succ);
4584 for (int i = LiveIn.find_first(); i != -1; i = LiveIn.find_next(i)) {
4585 Variable *Var = Func->getLiveness()->getVariable(i, Succ);
4586 if (Var->hasReg())
4587 Available[Var->getRegNum()] = false;
4588 }
4589 }
4590 // Iterate backwards through the Assignments. After lowering each
4591 // assignment, add Dest to the set of available registers, and
4592 // remove Src from the set of available registers. Iteration is
4593 // done backwards to enable incremental updates of the available
4594 // register set, and the lowered instruction numbers may be out of
4595 // order, but that can be worked around by renumbering the block
4596 // afterwards if necessary.
4597 for (const Inst &I : reverse_range(Assignments)) {
4599 auto Assign = llvm::dyn_cast<InstAssign>(&I);
4600 Variable *Dest = Assign->getDest();
4601 Operand *Src = Assign->getSrc(0);
4602 Variable *SrcVar = llvm::dyn_cast<Variable>(Src);
4603 // Use normal assignment lowering, except lower mem=mem specially
4604 // so we can register-allocate at the same time.
4605 if (!isMemoryOperand(Dest) || !isMemoryOperand(Src)) {
4606 lowerAssign(Assign);
4608 assert(Dest->getType() == Src->getType());
4609 const llvm::SmallBitVector &RegsForType =
4610 getRegisterSetForType(Dest->getType());
4611 llvm::SmallBitVector AvailRegsForType = RegsForType & Available;
4612 Variable *SpillLoc = nullptr;
4613 Variable *Preg = nullptr;
4614 // TODO(stichnot): Opportunity for register randomization.
4615 int32_t RegNum = AvailRegsForType.find_first();
4616 bool IsVector = isVectorType(Dest->getType());
4617 bool NeedSpill = (RegNum == -1);
4618 if (NeedSpill) {
4619 // Pick some register to spill and update RegNum.
4620 // TODO(stichnot): Opportunity for register randomization.
4621 RegNum = RegsForType.find_first();
4622 Preg = getPhysicalRegister(RegNum, Dest->getType());
4623 SpillLoc = Func->makeVariable(Dest->getType());
4624 // Create a fake def of the physical register to avoid
4625 // liveness inconsistency problems during late-stage liveness
4626 // analysis (e.g. asm-verbose mode).
4627 Context.insert(InstFakeDef::create(Func, Preg));
4628 if (IsVector)
4629 _movp(SpillLoc, Preg);
4630 else
4631 _mov(SpillLoc, Preg);
4632 }
4633 assert(RegNum >= 0);
4634 if (llvm::isa<ConstantUndef>(Src))
4635 // Materialize an actual constant instead of undef. RegNum is
4636 // passed in for vector types because undef vectors are
4637 // lowered to vector register of zeroes.
4638 Src =
4639 legalize(Src, Legal_All, IsVector ? RegNum : Variable::NoRegister);
4640 Variable *Tmp = makeReg(Dest->getType(), RegNum);
4641 if (IsVector) {
4642 _movp(Tmp, Src);
4643 _movp(Dest, Tmp);
4644 } else {
4645 _mov(Tmp, Src);
4646 _mov(Dest, Tmp);
4647 }
4648 if (NeedSpill) {
4649 // Restore the spilled register.
4650 if (IsVector)
4651 _movp(Preg, SpillLoc);
4652 else
4653 _mov(Preg, SpillLoc);
4654 // Create a fake use of the physical register to keep it live
4655 // for late-stage liveness analysis (e.g. asm-verbose mode).
4656 Context.insert(InstFakeUse::create(Func, Preg));
4659 // Update register availability before moving to the previous
4660 // instruction on the Assignments list.
4661 if (Dest->hasReg())
4662 Available[Dest->getRegNum()] = true;
4663 if (SrcVar && SrcVar->hasReg())
4664 Available[SrcVar->getRegNum()] = false;
4667 // Add the terminator branch instruction to the end.
4668 Context.setInsertPoint(Context.getEnd());
4669 _br(Succ);
4670 }
4672 // There is no support for loading or emitting vector constants, so the
4673 // vector values returned from makeVectorOfZeros, makeVectorOfOnes,
4674 // etc. are initialized with register operations.
4676 // TODO(wala): Add limited support for vector constants so that
4677 // complex initialization in registers is unnecessary.
4679 Variable *TargetX8632::makeVectorOfZeros(Type Ty, int32_t RegNum) {
4680 Variable *Reg = makeReg(Ty, RegNum);
4681 // Insert a FakeDef, since otherwise the live range of Reg might
4682 // be overestimated.
4683 Context.insert(InstFakeDef::create(Func, Reg));
4684 _pxor(Reg, Reg);
4685 return Reg;
4686 }
4688 Variable *TargetX8632::makeVectorOfMinusOnes(Type Ty, int32_t RegNum) {
4689 Variable *MinusOnes = makeReg(Ty, RegNum);
4690 // Insert a FakeDef so the live range of MinusOnes is not overestimated.
4691 Context.insert(InstFakeDef::create(Func, MinusOnes));
4692 _pcmpeq(MinusOnes, MinusOnes);
4693 return MinusOnes;
4694 }
4696 Variable *TargetX8632::makeVectorOfOnes(Type Ty, int32_t RegNum) {
4697 Variable *Dest = makeVectorOfZeros(Ty, RegNum);
4698 Variable *MinusOne = makeVectorOfMinusOnes(Ty);
4699 _psub(Dest, MinusOne);
4700 return Dest;
4701 }
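// The trick above: pcmpeq of a register with itself sets every bit (each
// lane compares equal), i.e. each lane holds -1, and subtracting -1 from
// a zeroed vector leaves +1 in every lane, yielding a vector of ones
// without loading any constant from memory.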
4703 Variable *TargetX8632::makeVectorOfHighOrderBits(Type Ty, int32_t RegNum) {
4704 assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
4705 Ty == IceType_v16i8);
4706 if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
4707 Variable *Reg = makeVectorOfOnes(Ty, RegNum);
4708 SizeT Shift = typeWidthInBytes(typeElementType(Ty)) * X86_CHAR_BIT - 1;
4709 _psll(Reg, Ctx->getConstantInt8(Shift));
4710 return Reg;
4711 }
4712 // SSE has no left shift operation for vectors of 8 bit integers.
4713 const uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
4714 Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
4715 Variable *Reg = makeReg(Ty, RegNum);
4716 _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
4717 _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
4718 return Reg;
4719 }
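// E.g., for v16i8 this leaves 0x80 in every byte lane: the 32-bit mask
// 0x80808080 lands in the low dword via movd, and pshufd with selector 0
// broadcasts that dword across all four dword lanes.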
4722 // Construct a mask in a register that can be and'ed with a
4723 // floating-point value to mask off its sign bit. The value will be
4724 // <4 x 0x7fffffff> for f32 and v4f32, and <2 x 0x7fffffffffffffff>
4725 // for f64. Construct it as a vector of all ones (minus ones) logically
4726 // right shifted one bit. TODO(stichnot): Fix the wala TODO above, to
4727 // represent vector constants in memory.
4728 Variable *TargetX8632::makeVectorOfFabsMask(Type Ty, int32_t RegNum) {
4729 Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
4730 _psrl(Reg, Ctx->getConstantInt8(1));
4731 return Reg;
4732 }
4734 OperandX8632Mem *TargetX8632::getMemoryOperandForStackSlot(Type Ty,
4735 Variable *Slot,
4736 uint32_t Offset) {
4737 // Ensure that Slot is a stack slot.
4738 assert(Slot->getWeight().isZero());
4739 assert(Slot->getRegNum() == Variable::NoRegister);
4740 // Compute the location of Slot in memory.
4741 // TODO(wala,stichnot): lea should not be required. The address of
4742 // the stack slot is known at compile time (although not until after
4743 // register allocation).
4744 const Type PointerType = IceType_i32;
4745 Variable *Loc = makeReg(PointerType);
4746 _lea(Loc, Slot);
4747 Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
4748 return OperandX8632Mem::create(Func, Ty, Loc, ConstantOffset);
4751 // Helper for legalize() to emit the right code to lower an operand to a
4752 // register of the appropriate type.
4753 Variable *TargetX8632::copyToReg(Operand *Src, int32_t RegNum) {
4754 Type Ty = Src->getType();
4755 Variable *Reg = makeReg(Ty, RegNum);
4756 if (isVectorType(Ty)) {
4757 _movp(Reg, Src);
4758 } else {
4759 _mov(Reg, Src);
4760 }
4761 return Reg;
4762 }
4764 Operand *TargetX8632::legalize(Operand *From, LegalMask Allowed,
4765 int32_t RegNum) {
4766 Type Ty = From->getType();
4767 // Assert that a physical register is allowed. To date, all calls
4768 // to legalize() allow a physical register. If a physical register
4769 // needs to be explicitly disallowed, then new code will need to be
4770 // written to force a spill.
4771 assert(Allowed & Legal_Reg);
4772 // If we're asking for a specific physical register, make sure we're
4773 // not allowing any other operand kinds. (This could be future
4774 // work, e.g. allow the shl shift amount to be either an immediate
4775 // or in ecx.)
4776 assert(RegNum == Variable::NoRegister || Allowed == Legal_Reg);
4777 if (auto Mem = llvm::dyn_cast<OperandX8632Mem>(From)) {
4778 // Before doing anything with a Mem operand, we need to ensure
4779 // that the Base and Index components are in physical registers.
4780 Variable *Base = Mem->getBase();
4781 Variable *Index = Mem->getIndex();
4782 Variable *RegBase = nullptr;
4783 Variable *RegIndex = nullptr;
4784 if (Base) {
4785 RegBase = legalizeToVar(Base);
4786 }
4787 if (Index) {
4788 RegIndex = legalizeToVar(Index);
4789 }
4790 if (Base != RegBase || Index != RegIndex) {
4791 From =
4792 OperandX8632Mem::create(Func, Ty, RegBase, Mem->getOffset(), RegIndex,
4793 Mem->getShift(), Mem->getSegmentRegister());
4794 }
4796 if (!(Allowed & Legal_Mem)) {
4797 From = copyToReg(From, RegNum);
4798 }
4799 return From;
4800 }
4801 if (llvm::isa<Constant>(From)) {
4802 if (llvm::isa<ConstantUndef>(From)) {
4803 // Lower undefs to zero. Another option is to lower undefs to an
4804 // uninitialized register; however, using an uninitialized register
4805 // results in less predictable code.
4807 // If in the future the implementation is changed to lower undef
4808 // values to uninitialized registers, a FakeDef will be needed:
4809 // Context.insert(InstFakeDef::create(Func, Reg));
4810 // This is in order to ensure that the live range of Reg is not
4811 // overestimated. If the constant being lowered is a 64 bit value,
4812 // then the result should be split and the lo and hi components will
4813 // need to go in uninitialized registers.
4814 if (isVectorType(Ty))
4815 return makeVectorOfZeros(Ty, RegNum);
4816 From = Ctx->getConstantZero(Ty);
4817 }
4818 // There should be no constants of vector type (other than undef).
4819 assert(!isVectorType(Ty));
4820 // Convert a scalar floating point constant into an explicit
4821 // memory operand.
4822 if (isScalarFloatingType(Ty)) {
4823 Variable *Base = nullptr;
4824 std::string Buffer;
4825 llvm::raw_string_ostream StrBuf(Buffer);
4826 llvm::cast<Constant>(From)->emitPoolLabel(StrBuf);
4827 Constant *Offset = Ctx->getConstantSym(0, StrBuf.str(), true);
4828 From = OperandX8632Mem::create(Func, Ty, Base, Offset);
4829 }
4830 bool NeedsReg = false;
4831 if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
4832 // Immediate specifically not allowed
4833 NeedsReg = true;
4834 if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
4835 // On x86, FP constants are lowered to mem operands.
4836 NeedsReg = true;
4837 if (NeedsReg) {
4838 From = copyToReg(From, RegNum);
4839 }
4840 return From;
4841 }
4842 if (auto Var = llvm::dyn_cast<Variable>(From)) {
4843 // Check if the variable is guaranteed a physical register. This
4844 // can happen either when the variable is pre-colored or when it is
4845 // assigned infinite weight.
4846 bool MustHaveRegister = (Var->hasReg() || Var->getWeight().isInf());
4847 // We need a new physical register for the operand if:
4848 // Mem is not allowed and Var isn't guaranteed a physical
4849 // register, or
4850 // RegNum is required and Var->getRegNum() doesn't match.
4851 if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
4852 (RegNum != Variable::NoRegister && RegNum != Var->getRegNum())) {
4853 From = copyToReg(From, RegNum);
4854 }
4855 return From;
4856 }
4857 llvm_unreachable("Unhandled operand kind in legalize()");
4858 return From;
4859 }
4861 // Provide a trivial wrapper to legalize() for this common usage.
4862 Variable *TargetX8632::legalizeToVar(Operand *From, int32_t RegNum) {
4863 return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
4866 // For the cmp instruction, if Src1 is an immediate, or known to be a
4867 // physical register, we can allow Src0 to be a memory operand.
4868 // Otherwise, Src0 must be copied into a physical register.
4869 // (Actually, either Src0 or Src1 can be chosen for the physical
4870 // register, but unfortunately we have to commit to one or the other
4871 // before register allocation.)
4872 Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
4873 bool IsSrc1ImmOrReg = false;
4874 if (llvm::isa<Constant>(Src1)) {
4875 IsSrc1ImmOrReg = true;
4876 } else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
4877 if (Var->hasReg())
4878 IsSrc1ImmOrReg = true;
4879 }
4880 return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
4881 }
4883 OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty,
4884 bool DoLegalize) {
4885 OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand);
4886 // It may be the case that address mode optimization already creates
4887 // an OperandX8632Mem, so in that case it wouldn't need another level
4888 // of transformation.
4889 if (Mem == nullptr) {
4890 Variable *Base = llvm::dyn_cast<Variable>(Operand);
4891 Constant *Offset = llvm::dyn_cast<Constant>(Operand);
4892 assert(Base || Offset);
4893 if (Offset) {
4894 // Make sure Offset is not undef.
4895 Offset = llvm::cast<Constant>(legalize(Offset));
4896 assert(llvm::isa<ConstantInteger32>(Offset) ||
4897 llvm::isa<ConstantRelocatable>(Offset));
4898 }
4899 Mem = OperandX8632Mem::create(Func, Ty, Base, Offset);
4900 }
4901 return llvm::cast<OperandX8632Mem>(DoLegalize ? legalize(Mem) : Mem);
4904 Variable *TargetX8632::makeReg(Type Type, int32_t RegNum) {
4905 // There aren't any 64-bit integer registers for x86-32.
4906 assert(Type != IceType_i64);
4907 Variable *Reg = Func->makeVariable(Type);
4908 if (RegNum == Variable::NoRegister)
4909 Reg->setWeightInfinite();
4910 else
4911 Reg->setRegNum(RegNum);
4912 return Reg;
4913 }
4915 void TargetX8632::postLower() {
4916 if (Ctx->getFlags().getOptLevel() == Opt_m1)
4917 return;
4921 void TargetX8632::makeRandomRegisterPermutation(
4922 llvm::SmallVectorImpl<int32_t> &Permutation,
4923 const llvm::SmallBitVector &ExcludeRegisters) const {
4924 // TODO(stichnot): Declaring Permutation this way loses type/size
4925 // information. Fix this in conjunction with the caller-side TODO.
4926 assert(Permutation.size() >= RegX8632::Reg_NUM);
4927 // Expected upper bound on the number of registers in a single
4928 // equivalence class. For x86-32, this would comprise the 8 XMM
4929 // registers. This is for performance, not correctness.
4930 static const unsigned MaxEquivalenceClassSize = 8;
4931 typedef llvm::SmallVector<int32_t, MaxEquivalenceClassSize> RegisterList;
4932 typedef std::map<uint32_t, RegisterList> EquivalenceClassMap;
4933 EquivalenceClassMap EquivalenceClasses;
4934 SizeT NumShuffled = 0, NumPreserved = 0;
4936 // Build up the equivalence classes of registers by looking at the
4937 // register properties as well as whether the registers should be
4938 // explicitly excluded from shuffling.
4939 #define X(val, encode, name, name16, name8, scratch, preserved, stackptr, \
4940 frameptr, isI8, isInt, isFP) \
4941 if (ExcludeRegisters[RegX8632::val]) { \
4942 /* val stays the same in the resulting permutation. */ \
4943 Permutation[RegX8632::val] = RegX8632::val; \
4944 ++NumPreserved; \
4945 } else { \
4946 const uint32_t Index = (scratch << 0) | (preserved << 1) | (isI8 << 2) | \
4947 (isInt << 3) | (isFP << 4); \
4948 /* val is assigned to an equivalence class based on its properties. */ \
4949 EquivalenceClasses[Index].push_back(RegX8632::val); \
4950 }
4951 REGX8632_TABLE
4952 #undef X
4954 RandomNumberGeneratorWrapper RNG(Ctx->getRNG());
4956 // Shuffle the resulting equivalence classes.
4957 for (auto I : EquivalenceClasses) {
4958 const RegisterList &List = I.second;
4959 RegisterList Shuffled(List);
4960 RandomShuffle(Shuffled.begin(), Shuffled.end(), RNG);
4961 for (size_t SI = 0, SE = Shuffled.size(); SI < SE; ++SI) {
4962 Permutation[List[SI]] = Shuffled[SI];
4963 ++NumShuffled;
4964 }
4965 }
4967 assert(NumShuffled + NumPreserved == RegX8632::Reg_NUM);
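// For example, eax, ecx, and edx share the same properties (scratch,
// i8-capable, integer) and so may be permuted among themselves, while
// the XMM registers form their own class; shuffling only within a class
// keeps every instruction's register constraints satisfiable.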
4969 if (Func->isVerbose(IceV_Random)) {
4970 OstreamLocker L(Func->getContext());
4971 Ostream &Str = Func->getContext()->getStrDump();
4972 Str << "Register equivalence classes:\n";
4973 for (auto I : EquivalenceClasses) {
4974 Str << "{";
4975 const RegisterList &List = I.second;
4976 bool First = true;
4977 for (int32_t Register : List) {
4978 if (!First)
4979 Str << " ";
4980 First = false;
4981 Str << getRegName(Register, IceType_i32);
4982 }
4983 Str << "}\n";
4984 }
4985 }
4986 }
4988 void TargetX8632::emit(const ConstantInteger32 *C) const {
4991 Ostream &Str = Ctx->getStrEmit();
4992 Str << getConstantPrefix() << C->getValue();
4995 void TargetX8632::emit(const ConstantInteger64 *) const {
4996 llvm::report_fatal_error("Not expecting to emit 64-bit integers");
4999 void TargetX8632::emit(const ConstantFloat *C) const {
5002 Ostream &Str = Ctx->getStrEmit();
5003 C->emitPoolLabel(Str);
5006 void TargetX8632::emit(const ConstantDouble *C) const {
5009 Ostream &Str = Ctx->getStrEmit();
5010 C->emitPoolLabel(Str);
5013 void TargetX8632::emit(const ConstantUndef *) const {
5014 llvm::report_fatal_error("undef value encountered by emitter.");
5017 TargetDataX8632::TargetDataX8632(GlobalContext *Ctx)
5018 : TargetDataLowering(Ctx) {}
5020 void TargetDataX8632::lowerGlobal(const VariableDeclaration &Var) {
5021 // If external and not initialized, this must be a cross test.
5022 // Don't generate a declaration for such cases.
5023 bool IsExternal = Var.isExternal() || Ctx->getFlags().getDisableInternal();
5024 if (IsExternal && !Var.hasInitializer())
5025 return;
5027 Ostream &Str = Ctx->getStrEmit();
5028 const VariableDeclaration::InitializerListType &Initializers =
5029 Var.getInitializers();
5030 bool HasNonzeroInitializer = Var.hasNonzeroInitializer();
5031 bool IsConstant = Var.getIsConstant();
5032 uint32_t Align = Var.getAlignment();
5033 SizeT Size = Var.getNumBytes();
5034 IceString MangledName = Var.mangleName(Ctx);
5035 IceString SectionSuffix = "";
5036 if (Ctx->getFlags().getDataSections())
5037 SectionSuffix = "." + MangledName;
5039 Str << "\t.type\t" << MangledName << ",@object\n";
5041 if (IsConstant)
5042 Str << "\t.section\t.rodata" << SectionSuffix << ",\"a\",@progbits\n";
5043 else if (HasNonzeroInitializer)
5044 Str << "\t.section\t.data" << SectionSuffix << ",\"aw\",@progbits\n";
5046 Str << "\t.section\t.bss" << SectionSuffix << ",\"aw\",@nobits\n";
5048 if (IsExternal)
5049 Str << "\t.globl\t" << MangledName << "\n";
5051 if (Align > 1)
5052 Str << "\t.align\t" << Align << "\n";
5054 Str << MangledName << ":\n";
5056 if (HasNonzeroInitializer) {
5057 for (VariableDeclaration::Initializer *Init : Initializers) {
5058 switch (Init->getKind()) {
5059 case VariableDeclaration::Initializer::DataInitializerKind: {
5060 const auto Data = llvm::cast<VariableDeclaration::DataInitializer>(Init)
5061 ->getContents();
5062 for (SizeT i = 0; i < Init->getNumBytes(); ++i) {
5063 Str << "\t.byte\t" << (((unsigned)Data[i]) & 0xff) << "\n";
5064 }
5065 break;
5066 }
5067 case VariableDeclaration::Initializer::ZeroInitializerKind:
5068 Str << "\t.zero\t" << Init->getNumBytes() << "\n";
5069 break;
5070 case VariableDeclaration::Initializer::RelocInitializerKind: {
5071 const auto Reloc =
5072 llvm::cast<VariableDeclaration::RelocInitializer>(Init);
5073 Str << "\t.long\t";
5074 Str << Reloc->getDeclaration()->mangleName(Ctx);
5075 if (RelocOffsetT Offset = Reloc->getOffset()) {
5076 if (Offset >= 0 || (Offset == INT32_MIN))
5077 Str << " + " << Offset;
5078 else
5079 Str << " - " << -Offset;
5080 }
5081 Str << "\n";
5082 break;
5083 }
5084 }
5085 }
5086 } else {
5087 // NOTE: for non-constant zero initializers, this is BSS (no bits),
5088 // so an ELF writer would not write to the file, and only track
5089 // virtual offsets, but the .s writer still needs this .zero and
5090 // cannot simply use the .size to advance offsets.
5091 Str << "\t.zero\t" << Size << "\n";
5092 }
5093 Str << "\t.size\t" << MangledName << ", " << Size << "\n";
5094 }
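// For illustration, a 4-byte constant global "g" (name invented) with
// data sections enabled is emitted along the lines of:
//   .type g,@object
//   .section .rodata.g,"a",@progbits
// g:
//   .byte ... (four of them)
//   .size g, 4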
5096 void TargetDataX8632::lowerGlobals(
5097 std::unique_ptr<VariableDeclarationList> Vars) {
5098 switch (Ctx->getFlags().getOutFileType()) {
5099 case FT_Elf: {
5100 ELFObjectWriter *Writer = Ctx->getObjectWriter();
5101 Writer->writeDataSection(*Vars, llvm::ELF::R_386_32);
5102 } break;
5103 case FT_Asm:
5104 case FT_Iasm: {
5105 const IceString &TranslateOnly = Ctx->getFlags().getTranslateOnly();
5106 OstreamLocker L(Ctx);
5107 for (const VariableDeclaration *Var : *Vars) {
5108 if (GlobalContext::matchSymbolName(Var->getName(), TranslateOnly)) {
5109 lowerGlobal(*Var);
5110 }
5111 }
5112 } break;
5113 }
5114 }
5116 template <typename T> struct PoolTypeConverter {};
5118 template <> struct PoolTypeConverter<float> {
5119 typedef uint32_t PrimitiveIntType;
5120 typedef ConstantFloat IceType;
5121 static const Type Ty = IceType_f32;
5122 static const char *TypeName;
5123 static const char *AsmTag;
5124 static const char *PrintfString;
5126 const char *PoolTypeConverter<float>::TypeName = "float";
5127 const char *PoolTypeConverter<float>::AsmTag = ".long";
5128 const char *PoolTypeConverter<float>::PrintfString = "0x%x";
5130 template <> struct PoolTypeConverter<double> {
5131 typedef uint64_t PrimitiveIntType;
5132 typedef ConstantDouble IceType;
5133 static const Type Ty = IceType_f64;
5134 static const char *TypeName;
5135 static const char *AsmTag;
5136 static const char *PrintfString;
5138 const char *PoolTypeConverter<double>::TypeName = "double";
5139 const char *PoolTypeConverter<double>::AsmTag = ".quad";
5140 const char *PoolTypeConverter<double>::PrintfString = "0x%llx";
5142 template <typename T>
5143 void TargetDataX8632::emitConstantPool(GlobalContext *Ctx) {
5146 Ostream &Str = Ctx->getStrEmit();
5147 Type Ty = T::Ty;
5148 SizeT Align = typeAlignInBytes(Ty);
5149 ConstantList Pool = Ctx->getConstantPool(Ty);
5151 Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
5152 << "\n";
5153 Str << "\t.align\t" << Align << "\n";
5154 for (Constant *C : Pool) {
5155 typename T::IceType *Const = llvm::cast<typename T::IceType>(C);
5156 typename T::IceType::PrimType Value = Const->getValue();
5157 // Use memcpy() to copy bits from Value into RawValue in a way
5158 // that avoids breaking strict-aliasing rules.
5159 typename T::PrimitiveIntType RawValue;
5160 memcpy(&RawValue, &Value, sizeof(Value));
5161 char buf[30];
5162 int CharsPrinted =
5163 snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
5164 assert(CharsPrinted >= 0 &&
5165 (size_t)CharsPrinted < llvm::array_lengthof(buf));
5166 (void)CharsPrinted; // avoid warnings if asserts are disabled
5167 Const->emitPoolLabel(Str);
5168 Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t# " << T::TypeName << " "
5169 << Value << "\n";
5170 }
5171 }
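// For example, a pooled float 1.0 entry comes out roughly as (label
// naming scheme is illustrative; the actual label comes from
// emitPoolLabel()):
//   .L$float$3f800000:
//           .long 0x3f800000        # float 1.000000e+00
// following the section/align directives emitted above.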
5173 void TargetDataX8632::lowerConstants() {
5174 if (Ctx->getFlags().getDisableTranslation())
5175 return;
5176 // No need to emit constants from the int pool since (for x86) they
5177 // are embedded as immediates in the instructions, just emit float/double.
5178 switch (Ctx->getFlags().getOutFileType()) {
5179 case FT_Elf: {
5180 ELFObjectWriter *Writer = Ctx->getObjectWriter();
5181 Writer->writeConstantPool<ConstantFloat>(IceType_f32);
5182 Writer->writeConstantPool<ConstantDouble>(IceType_f64);
5183 } break;
5184 case FT_Asm:
5185 case FT_Iasm: {
5186 OstreamLocker L(Ctx);
5187 emitConstantPool<PoolTypeConverter<float>>(Ctx);
5188 emitConstantPool<PoolTypeConverter<double>>(Ctx);
5189 } break;
5190 }
5191 }
5193 TargetHeaderX8632::TargetHeaderX8632(GlobalContext *Ctx)
5194 : TargetHeaderLowering(Ctx) {}
5196 } // end of namespace Ice