src/IceTargetLoweringX86BaseImpl.h

   1 //===- subzero/src/IceTargetLoweringX86BaseImpl.h - x86 lowering -*- C++ -*-==//
   2 //
   3 //                        The Subzero Code Generator
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 ///
  10 /// \file
  11 /// \brief Implements the TargetLoweringX86Base class, which consists almost
  12 /// entirely of the lowering sequence for each high-level instruction.
  13 ///
  14 //===----------------------------------------------------------------------===//
  15
  16 #ifndef SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
  17 #define SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H
  18
  19 #include "IceCfg.h"
  20 #include "IceCfgNode.h"
  21 #include "IceClFlags.h"
  22 #include "IceDefs.h"
  23 #include "IceELFObjectWriter.h"
  24 #include "IceGlobalInits.h"
  25 #include "IceInstVarIter.h"
  26 #include "IceLiveness.h"
  27 #include "IceOperand.h"
  28 #include "IcePhiLoweringImpl.h"
  29 #include "IceUtils.h"
  30 #include "IceInstX86Base.h"
  31 #include "llvm/Support/MathExtras.h"
  32
  33 #include <stack>
  34
  35 namespace Ice {
namespace X86 {
/// Primary template, intentionally empty. Only the explicit specializations
/// below describe a poolable constant type; each one declares the same set of
/// members (PrimitiveIntType, IceType, Ty, TypeName, AsmTag, PrintfString).
template <typename T> struct PoolTypeConverter {};

/// Constant-pool description for f32 constants.
template <> struct PoolTypeConverter<float> {
  /// Integer type associated with the constant's value.
  // NOTE(review): presumably the same-width raw representation -- confirm at
  // the use site.
  using PrimitiveIntType = uint32_t;
  /// Constant class that represents values of this type.
  using IceType = ConstantFloat;
  /// Ice type tag for this pool.
  static const Type Ty = IceType_f32;
  // The three strings below are only declared here; their definitions (and
  // exact contents) are not visible in this part of the file.
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

/// Constant-pool description for f64 constants.
template <> struct PoolTypeConverter<double> {
  using PrimitiveIntType = uint64_t;
  using IceType = ConstantDouble;
  static const Type Ty = IceType_f64;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

/// Converter used for constant pooling of i32 integer constants.
template <> struct PoolTypeConverter<uint32_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i32;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

/// Converter used for constant pooling of i16 integer constants. Note that
/// both PrimitiveIntType and IceType are the 32-bit ones; only Ty differs
/// from the uint32_t specialization.
template <> struct PoolTypeConverter<uint16_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i16;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};

/// Converter used for constant pooling of i8 integer constants. As with i16,
/// the 32-bit PrimitiveIntType and IceType are reused and only Ty differs.
template <> struct PoolTypeConverter<uint8_t> {
  using PrimitiveIntType = uint32_t;
  using IceType = ConstantInteger32;
  static const Type Ty = IceType_i8;
  static const char *TypeName;
  static const char *AsmTag;
  static const char *PrintfString;
};
} // end of namespace X86
  87
  88 namespace X86NAMESPACE {
  89
  90 using Utils::BoolFlagSaver;
  91
/// Per-variable bookkeeping record kept by BoolFolding: one entry describes
/// the instruction that produces an i1 variable together with the facts
/// gathered about that variable's uses within the current basic block.
template <typename Traits> class BoolFoldingEntry {
  // Copy construction is disabled, but copy assignment (below) is kept.
  BoolFoldingEntry(const BoolFoldingEntry &) = delete;

public:
  BoolFoldingEntry() = default;
  /// Records \p I as the producer and caches its hasComplexLowering() result.
  explicit BoolFoldingEntry(Inst *I);
  /// Copy assignment stays available; BoolFolding::init() relies on it when it
  /// assigns a freshly constructed entry into the Producers map.
  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
  /// Instr is the instruction producing the i1-type variable of interest. A
  /// null Instr marks the entry as invalid (not foldable).
  Inst *Instr = nullptr;
  /// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
  bool IsComplex = false;
  /// IsLiveOut is initialized conservatively to true, and is set to false when
  /// we encounter an instruction that ends Var's live range. We disable the
  /// folding optimization when Var is live beyond this basic block. Note that
  /// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
  /// always be true and the folding optimization will never be performed.
  bool IsLiveOut = true;
  /// NumUses counts the number of times Var is used as a source operand in the
  /// basic block. If IsComplex is true and there is more than one use of Var,
  /// then the folding optimization is disabled for Var.
  uint32_t NumUses = 0;
};
 114
 115 template <typename Traits> class BoolFolding {
 116 public:
 117   enum BoolFoldingProducerKind {
 118     PK_None,
 119     // TODO(jpp): PK_Icmp32 is no longer meaningful. Rename to PK_IcmpNative.
 120     PK_Icmp32,
 121     PK_Icmp64,
 122     PK_Fcmp,
 123     PK_Trunc,
 124     PK_Arith // A flag-setting arithmetic instruction.
 125   };
 126
 127   /// Currently the actual enum values are not used (other than CK_None), but we
 128   /// go ahead and produce them anyway for symmetry with the
 129   /// BoolFoldingProducerKind.
 130   enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
 131
 132 private:
 133   BoolFolding(const BoolFolding &) = delete;
 134   BoolFolding &operator=(const BoolFolding &) = delete;
 135
 136 public:
 137   BoolFolding() = default;
 138   static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
 139   static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
 140   static bool hasComplexLowering(const Inst *Instr);
 141   static bool isValidFolding(BoolFoldingProducerKind ProducerKind,
 142                              BoolFoldingConsumerKind ConsumerKind);
 143   void init(CfgNode *Node);
 144   const Inst *getProducerFor(const Operand *Opnd) const;
 145   void dump(const Cfg *Func) const;
 146
 147 private:
 148   /// Returns true if Producers contains a valid entry for the given VarNum.
 149   bool containsValid(SizeT VarNum) const {
 150     auto Element = Producers.find(VarNum);
 151     return Element != Producers.end() && Element->second.Instr != nullptr;
 152   }
 153   void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
 154   void invalidateProducersOnStore(const Inst *Instr);
 155   /// Producers maps Variable::Number to a BoolFoldingEntry.
 156   CfgUnorderedMap<SizeT, BoolFoldingEntry<Traits>> Producers;
 157 };
 158
 159 template <typename Traits>
 160 BoolFoldingEntry<Traits>::BoolFoldingEntry(Inst *I)
 161     : Instr(I), IsComplex(BoolFolding<Traits>::hasComplexLowering(I)) {}
 162
 163 template <typename Traits>
 164 typename BoolFolding<Traits>::BoolFoldingProducerKind
 165 BoolFolding<Traits>::getProducerKind(const Inst *Instr) {
 166   if (llvm::isa<InstIcmp>(Instr)) {
 167     if (Traits::Is64Bit || Instr->getSrc(0)->getType() != IceType_i64)
 168       return PK_Icmp32;
 169     return PK_Icmp64;
 170   }
 171   if (llvm::isa<InstFcmp>(Instr))
 172     return PK_Fcmp;
 173   if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
 174     if (Traits::Is64Bit || Arith->getSrc(0)->getType() != IceType_i64) {
 175       switch (Arith->getOp()) {
 176       default:
 177         return PK_None;
 178       case InstArithmetic::And:
 179       case InstArithmetic::Or:
 180         return PK_Arith;
 181       }
 182     }
 183   }
 184   return PK_None; // TODO(stichnot): remove this
 185
 186   if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
 187     switch (Cast->getCastKind()) {
 188     default:
 189       return PK_None;
 190     case InstCast::Trunc:
 191       return PK_Trunc;
 192     }
 193   }
 194   return PK_None;
 195 }
 196
 197 template <typename Traits>
 198 typename BoolFolding<Traits>::BoolFoldingConsumerKind
 199 BoolFolding<Traits>::getConsumerKind(const Inst *Instr) {
 200   if (llvm::isa<InstBr>(Instr))
 201     return CK_Br;
 202   if (llvm::isa<InstSelect>(Instr))
 203     return CK_Select;
 204   return CK_None; // TODO(stichnot): remove this
 205
 206   if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
 207     switch (Cast->getCastKind()) {
 208     default:
 209       return CK_None;
 210     case InstCast::Sext:
 211       return CK_Sext;
 212     case InstCast::Zext:
 213       return CK_Zext;
 214     }
 215   }
 216   return CK_None;
 217 }
 218
 219 /// Returns true if the producing instruction has a "complex" lowering sequence.
 220 /// This generally means that its lowering sequence requires more than one
 221 /// conditional branch, namely 64-bit integer compares and some floating-point
 222 /// compares. When this is true, and there is more than one consumer, we prefer
 223 /// to disable the folding optimization because it minimizes branches.
 224 template <typename Traits>
 225 bool BoolFolding<Traits>::hasComplexLowering(const Inst *Instr) {
 226   switch (getProducerKind(Instr)) {
 227   default:
 228     return false;
 229   case PK_Icmp64:
 230     return !Traits::Is64Bit;
 231   case PK_Fcmp:
 232     return Traits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
 233            Traits::Cond::Br_None;
 234   }
 235 }
 236
 237 template <typename Traits>
 238 bool BoolFolding<Traits>::isValidFolding(
 239     typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind,
 240     typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind) {
 241   switch (ProducerKind) {
 242   default:
 243     return false;
 244   case PK_Icmp32:
 245   case PK_Icmp64:
 246   case PK_Fcmp:
 247     return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
 248   case PK_Arith:
 249     return ConsumerKind == CK_Br;
 250   }
 251 }
 252
 253 template <typename Traits> void BoolFolding<Traits>::init(CfgNode *Node) {
 254   Producers.clear();
 255   for (Inst &Instr : Node->getInsts()) {
 256     if (Instr.isDeleted())
 257       continue;
 258     invalidateProducersOnStore(&Instr);
 259     // Check whether Instr is a valid producer.
 260     Variable *Var = Instr.getDest();
 261     if (Var // only consider instructions with an actual dest var
 262         && Var->getType() == IceType_i1          // only bool-type dest vars
 263         && getProducerKind(&Instr) != PK_None) { // white-listed instructions
 264       Producers[Var->getIndex()] = BoolFoldingEntry<Traits>(&Instr);
 265     }
 266     // Check each src variable against the map.
 267     FOREACH_VAR_IN_INST(Var, Instr) {
 268       SizeT VarNum = Var->getIndex();
 269       if (!containsValid(VarNum))
 270         continue;
 271       // All valid consumers use Var as the first source operand
 272       if (IndexOfVarOperandInInst(Var) != 0) {
 273         setInvalid(VarNum);
 274         continue;
 275       }
 276       // Consumer instructions must be white-listed
 277       typename BoolFolding<Traits>::BoolFoldingConsumerKind ConsumerKind =
 278           getConsumerKind(&Instr);
 279       if (ConsumerKind == CK_None) {
 280         setInvalid(VarNum);
 281         continue;
 282       }
 283       typename BoolFolding<Traits>::BoolFoldingProducerKind ProducerKind =
 284           getProducerKind(Producers[VarNum].Instr);
 285       if (!isValidFolding(ProducerKind, ConsumerKind)) {
 286         setInvalid(VarNum);
 287         continue;
 288       }
 289       // Avoid creating multiple copies of complex producer instructions.
 290       if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
 291         setInvalid(VarNum);
 292         continue;
 293       }
 294       ++Producers[VarNum].NumUses;
 295       if (Instr.isLastUse(Var)) {
 296         Producers[VarNum].IsLiveOut = false;
 297       }
 298     }
 299   }
 300   for (auto &I : Producers) {
 301     // Ignore entries previously marked invalid.
 302     if (I.second.Instr == nullptr)
 303       continue;
 304     // Disable the producer if its dest may be live beyond this block.
 305     if (I.second.IsLiveOut) {
 306       setInvalid(I.first);
 307       continue;
 308     }
 309     // Mark as "dead" rather than outright deleting. This is so that other
 310     // peephole style optimizations during or before lowering have access to
 311     // this instruction in undeleted form. See for example
 312     // tryOptimizedCmpxchgCmpBr().
 313     I.second.Instr->setDead();
 314   }
 315 }
 316
 317 template <typename Traits>
 318 const Inst *BoolFolding<Traits>::getProducerFor(const Operand *Opnd) const {
 319   auto *Var = llvm::dyn_cast<const Variable>(Opnd);
 320   if (Var == nullptr)
 321     return nullptr;
 322   SizeT VarNum = Var->getIndex();
 323   auto Element = Producers.find(VarNum);
 324   if (Element == Producers.end())
 325     return nullptr;
 326   return Element->second.Instr;
 327 }
 328
 329 template <typename Traits>
 330 void BoolFolding<Traits>::dump(const Cfg *Func) const {
 331   if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
 332     return;
 333   OstreamLocker L(Func->getContext());
 334   Ostream &Str = Func->getContext()->getStrDump();
 335   for (auto &I : Producers) {
 336     if (I.second.Instr == nullptr)
 337       continue;
 338     Str << "Found foldable producer:\n  ";
 339     I.second.Instr->dump(Func);
 340     Str << "\n";
 341   }
 342 }
 343
 344 /// If the given instruction has potential memory side effects (e.g. store, rmw,
 345 /// or a call instruction with potential memory side effects), then we must not
 346 /// allow a pre-store Producer instruction with memory operands to be folded
 347 /// into a post-store Consumer instruction.  If this is detected, the Producer
 348 /// is invalidated.
 349 ///
 350 /// We use the Producer's IsLiveOut field to determine whether any potential
 351 /// Consumers come after this store instruction.  The IsLiveOut field is
 352 /// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
 353 /// sees the variable's definitive last use (indicating the variable is not in
 354 /// the node's live-out set).  Thus if we see here that IsLiveOut is false, we
 355 /// know that there can be no consumers after the store, and therefore we know
 356 /// the folding is safe despite the store instruction.
 357 template <typename Traits>
 358 void BoolFolding<Traits>::invalidateProducersOnStore(const Inst *Instr) {
 359   if (!Instr->isMemoryWrite())
 360     return;
 361   for (auto &ProducerPair : Producers) {
 362     if (!ProducerPair.second.IsLiveOut)
 363       continue;
 364     Inst *PInst = ProducerPair.second.Instr;
 365     if (PInst == nullptr)
 366       continue;
 367     bool HasMemOperand = false;
 368     const SizeT SrcSize = PInst->getSrcSize();
 369     for (SizeT I = 0; I < SrcSize; ++I) {
 370       if (llvm::isa<typename Traits::X86OperandMem>(PInst->getSrc(I))) {
 371         HasMemOperand = true;
 372         break;
 373       }
 374     }
 375     if (!HasMemOperand)
 376       continue;
 377     setInvalid(ProducerPair.first);
 378   }
 379 }
 380
/// Prepares per-node state before \p Node is lowered: rebuilds the bool
/// folding table for the node and then dumps the foldable producers that were
/// found (the dump itself is a no-op unless folding verbosity is enabled).
template <typename TraitsType>
void TargetX86Base<TraitsType>::initNodeForLowering(CfgNode *Node) {
  FoldingInfo.init(Node);
  FoldingInfo.dump(Func);
}
 386
 387 template <typename TraitsType>
 388 TargetX86Base<TraitsType>::TargetX86Base(Cfg *Func)
 389     : TargetLowering(Func), NeedSandboxing(SandboxingType == ST_NaCl) {
 390   static_assert(
 391       (Traits::InstructionSet::End - Traits::InstructionSet::Begin) ==
 392           (TargetInstructionSet::X86InstructionSet_End -
 393            TargetInstructionSet::X86InstructionSet_Begin),
 394       "Traits::InstructionSet range different from TargetInstructionSet");
 395   if (getFlags().getTargetInstructionSet() !=
 396       TargetInstructionSet::BaseInstructionSet) {
 397     InstructionSet = static_cast<InstructionSetEnum>(
 398         (getFlags().getTargetInstructionSet() -
 399          TargetInstructionSet::X86InstructionSet_Begin) +
 400         Traits::InstructionSet::Begin);
 401   }
 402 }
 403
 404 template <typename TraitsType>
 405 void TargetX86Base<TraitsType>::staticInit(GlobalContext *Ctx) {
 406   RegNumT::setLimit(Traits::RegisterSet::Reg_NUM);
 407   Traits::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
 408   for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
 409     TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
 410   filterTypeToRegisterSet(Ctx, Traits::RegisterSet::Reg_NUM,
 411                           TypeToRegisterSet.data(), TypeToRegisterSet.size(),
 412                           Traits::getRegName, getRegClassName);
 413   PcRelFixup = Traits::FK_PcRel;
 414   AbsFixup = getFlags().getUseNonsfi() ? Traits::FK_Gotoff : Traits::FK_Abs;
 415 }
 416
 417 template <typename TraitsType>
 418 bool TargetX86Base<TraitsType>::shouldBePooled(const Constant *C) {
 419   if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
 420     return !Utils::isPositiveZero(ConstFloat->getValue());
 421   }
 422   if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
 423     return !Utils::isPositiveZero(ConstDouble->getValue());
 424   }
 425   if (getFlags().getRandomizeAndPoolImmediatesOption() != RPI_Pool) {
 426     return false;
 427   }
 428   return C->shouldBeRandomizedOrPooled();
 429 }
 430
/// Runs the O2 translation pipeline on Func. In order: target helper call
/// insertion, alloca merging, loop analysis with the optional LICM, local CSE
/// and short-circuit passes, simple Phi lowering (unless Phi edge splitting is
/// enabled), address-mode and RMW optimization, argument lowering, basic
/// liveness, load folding, code generation, global register allocation,
/// optional advanced Phi lowering, frame generation, node layout, branch
/// optimization, nop insertion and sandbox marking.
///
/// The pipeline returns early, without running the remaining passes, as soon
/// as Func reports an error after one of the checked steps.
template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
  TimerMarker T(TimerStack::TT_O2, Func);

  // Any form of sandboxing needs the rebase pointer set up first.
  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();
  Func->dump("After target helper call insertion");

  // Merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = true;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  // Run this early so it can be used to focus optimizations on potentially hot
  // code.
  // TODO(stichnot,ascull): currently only used for regalloc not
  // expensive high level optimizations which could be focused on potentially
  // hot code.
  Func->generateLoopInfo();
  Func->dump("After loop analysis");
  if (getFlags().getLoopInvariantCodeMotion()) {
    Func->loopInvariantCodeMotion();
    Func->dump("After LICM");
  }

  // Local CSE is gated on the "experimental" flag.
  if (getFlags().getEnableExperimental()) {
    Func->localCSE();
    Func->dump("After Local CSE");
  }
  if (getFlags().getEnableShortCircuit()) {
    Func->shortCircuitJumps();
    Func->dump("After Short Circuiting");
  }

  // With Phi edge splitting enabled, Phis are instead lowered after register
  // allocation (see advancedPhiLowering() below).
  if (!getFlags().getEnablePhiEdgeSplit()) {
    // Lower Phi instructions.
    Func->placePhiLoads();
    if (Func->hasError())
      return;
    Func->placePhiStores();
    if (Func->hasError())
      return;
    Func->deletePhis();
    if (Func->hasError())
      return;
    Func->dump("After Phi lowering");
  }

  // Address mode optimization.
  Func->getVMetadata()->init(VMK_SingleDefs);
  Func->doAddressOpt();
  Func->materializeVectorShuffles();

  // Find read-modify-write opportunities. Do this after address mode
  // optimization so that doAddressOpt() doesn't need to be applied to RMW
  // instructions as well.
  findRMW();
  Func->dump("After RMW transform");

  // Argument lowering
  Func->doArgLowering();

  // Target lowering. This requires liveness analysis for some parts of the
  // lowering decisions, such as compare/branch fusing. If non-lightweight
  // liveness analysis is used, the instructions need to be renumbered first.
  // TODO: This renumbering should only be necessary if we're actually
  // calculating live intervals, which we only do for register allocation.
  Func->renumberInstructions();
  if (Func->hasError())
    return;

  // TODO: It should be sufficient to use the fastest liveness calculation,
  // i.e. livenessLightweight(). However, for some reason that slows down the
  // rest of the translation. Investigate.
  Func->liveness(Liveness_Basic);
  if (Func->hasError())
    return;
  // NOTE(review): this label mentions address mode opt, but the dump is taken
  // here, after RMW, argument lowering, renumbering and basic liveness.
  Func->dump("After x86 address mode opt");

  // Disable constant blinding or pooling for load optimization. The flag is
  // only overridden for the duration of this scope.
  {
    BoolFlagSaver B(RandomizationPoolingPaused, true);
    doLoadOpt();
  }
  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  Func->dump("After x86 codegen");

  // Register allocation. This requires instruction renumbering and full
  // liveness analysis. Loops must be identified before liveness so variable
  // use weights are correct.
  Func->renumberInstructions();
  if (Func->hasError())
    return;
  Func->liveness(Liveness_Intervals);
  if (Func->hasError())
    return;
  // The post-codegen dump is done here, after liveness analysis and associated
  // cleanup, to make the dump cleaner and more useful.
  // NOTE(review): the label says "x8632" although this code is a template
  // parameterized on the target traits -- confirm whether that is intended.
  Func->dump("After initial x8632 codegen");
  // Validate the live range computations. The expensive validation call is
  // deliberately only made when assertions are enabled.
  assert(Func->validateLiveness());
  Func->getVMetadata()->init(VMK_All);
  regAlloc(RAK_Global);
  if (Func->hasError())
    return;
  Func->dump("After linear scan regalloc");

  // Counterpart of the simple Phi lowering that was skipped above.
  if (getFlags().getEnablePhiEdgeSplit()) {
    Func->advancedPhiLowering();
    Func->dump("After advanced Phi lowering");
  }

  // Stack frame mapping.
  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  Func->contractEmptyNodes();
  Func->reorderNodes();

  // Shuffle basic block order if -reorder-basic-blocks is enabled.
  Func->shuffleNodes();

  // Branch optimization.  This needs to be done just before code emission. In
  // particular, no transformations that insert or reorder CfgNodes should be
  // done after branch optimization. We go ahead and do it before nop insertion
  // to reduce the amount of work needed for searching for opportunities.
  Func->doBranchOpt();
  Func->dump("After branch optimization");

  // Nop insertion if -nop-insertion is enabled.
  Func->doNopInsertion();

  // Mark nodes that require sandbox alignment
  if (NeedSandboxing) {
    Func->markNodesForSandboxing();
  }
}
 578
/// Runs the Om1 (minimal optimization) translation pipeline on Func. Compared
/// with translateO2() it skips loop analysis, the optional optimization
/// passes, address-mode/RMW/load optimization, liveness analysis, node
/// reordering and branch optimization, and it only register-allocates
/// infinite-weight variables.
///
/// The pipeline returns early, without running the remaining passes, as soon
/// as Func reports an error after one of the checked steps.
template <typename TraitsType> void TargetX86Base<TraitsType>::translateOm1() {
  TimerMarker T(TimerStack::TT_Om1, Func);

  // Any form of sandboxing needs the rebase pointer set up first.
  if (SandboxingType != ST_None) {
    initRebasePtr();
  }

  genTargetHelperCalls();

  // Do not merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = false;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  // Phis are always lowered with the simple scheme here; the Phi edge
  // splitting flag is not consulted.
  Func->placePhiLoads();
  if (Func->hasError())
    return;
  Func->placePhiStores();
  if (Func->hasError())
    return;
  Func->deletePhis();
  if (Func->hasError())
    return;
  Func->dump("After Phi lowering");

  Func->doArgLowering();
  Func->genCode();
  if (Func->hasError())
    return;
  if (SandboxingType != ST_None) {
    initSandbox();
  }
  // NOTE(review): the label says "x8632" although this code is a template
  // parameterized on the target traits -- confirm whether that is intended.
  Func->dump("After initial x8632 codegen");

  // Only variables that must live in a register are allocated.
  regAlloc(RAK_InfOnly);
  if (Func->hasError())
    return;
  Func->dump("After regalloc of infinite-weight variables");

  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  // Shuffle basic block order if -reorder-basic-blocks is enabled.
  Func->shuffleNodes();

  // Nop insertion if -nop-insertion is enabled.
  Func->doNopInsertion();

  // Mark nodes that require sandbox alignment
  if (NeedSandboxing)
    Func->markNodesForSandboxing();
}
 633
 634 inline bool canRMW(const InstArithmetic *Arith) {
 635   Type Ty = Arith->getDest()->getType();
 636   // X86 vector instructions write to a register and have no RMW option.
 637   if (isVectorType(Ty))
 638     return false;
 639   bool isI64 = Ty == IceType_i64;
 640
 641   switch (Arith->getOp()) {
 642   // Not handled for lack of simple lowering:
 643   //   shift on i64
 644   //   mul, udiv, urem, sdiv, srem, frem
 645   // Not handled for lack of RMW instructions:
 646   //   fadd, fsub, fmul, fdiv (also vector types)
 647   default:
 648     return false;
 649   case InstArithmetic::Add:
 650   case InstArithmetic::Sub:
 651   case InstArithmetic::And:
 652   case InstArithmetic::Or:
 653   case InstArithmetic::Xor:
 654     return true;
 655   case InstArithmetic::Shl:
 656   case InstArithmetic::Lshr:
 657   case InstArithmetic::Ashr:
 658     return false; // TODO(stichnot): implement
 659     return !isI64;
 660   }
 661 }
 662
 663 template <typename TraitsType>
 664 bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
 665   if (A == B)
 666     return true;
 667   if (auto *MemA =
 668           llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
 669               A)) {
 670     if (auto *MemB =
 671             llvm::dyn_cast<typename TargetX86Base<TraitsType>::X86OperandMem>(
 672                 B)) {
 673       return MemA->getBase() == MemB->getBase() &&
 674              MemA->getOffset() == MemB->getOffset() &&
 675              MemA->getIndex() == MemB->getIndex() &&
 676              MemA->getShift() == MemB->getShift() &&
 677              MemA->getSegmentRegister() == MemB->getSegmentRegister();
 678     }
 679   }
 680   return false;
 681 }
 682
 683 template <typename TraitsType> void TargetX86Base<TraitsType>::findRMW() {
 684   TimerMarker _(TimerStack::TT_findRMW, Func);
 685   Func->dump("Before RMW");
 686   if (Func->isVerbose(IceV_RMW))
 687     Func->getContext()->lockStr();
 688   for (CfgNode *Node : Func->getNodes()) {
 689     // Walk through the instructions, considering each sequence of 3
 690     // instructions, and look for the particular RMW pattern. Note that this
 691     // search can be "broken" (false negatives) if there are intervening
 692     // deleted instructions, or intervening instructions that could be safely
 693     // moved out of the way to reveal an RMW pattern.
 694     auto E = Node->getInsts().end();
 695     auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
 696     for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
 697       // Make I3 skip over deleted instructions.
 698       while (I3 != E && I3->isDeleted())
 699         ++I3;
 700       if (I1 == E || I2 == E || I3 == E)
 701         continue;
 702       assert(!I1->isDeleted());
 703       assert(!I2->isDeleted());
 704       assert(!I3->isDeleted());
 705       auto *Load = llvm::dyn_cast<InstLoad>(I1);
 706       auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
 707       auto *Store = llvm::dyn_cast<InstStore>(I3);
 708       if (!Load || !Arith || !Store)
 709         continue;
 710       // Look for:
 711       //   a = Load addr
 712       //   b = <op> a, other
 713       //   Store b, addr
 714       // Change to:
 715       //   a = Load addr
 716       //   b = <op> a, other
 717       //   x = FakeDef
 718       //   RMW <op>, addr, other, x
 719       //   b = Store b, addr, x
 720       // Note that inferTwoAddress() makes sure setDestRedefined() gets called
 721       // on the updated Store instruction, to avoid liveness problems later.
 722       //
 723       // With this transformation, the Store instruction acquires a Dest
 724       // variable and is now subject to dead code elimination if there are no
 725       // more uses of "b".  Variable "x" is a beacon for determining whether the
 726       // Store instruction gets dead-code eliminated.  If the Store instruction
 727       // is eliminated, then it must be the case that the RMW instruction ends
 728       // x's live range, and therefore the RMW instruction will be retained and
 729       // later lowered.  On the other hand, if the RMW instruction does not end
 730       // x's live range, then the Store instruction must still be present, and
 731       // therefore the RMW instruction is ignored during lowering because it is
 732       // redundant with the Store instruction.
 733       //
 734       // Note that if "a" has further uses, the RMW transformation may still
 735       // trigger, resulting in two loads and one store, which is worse than the
 736       // original one load and one store.  However, this is probably rare, and
 737       // caching probably keeps it just as fast.
 738       if (!isSameMemAddressOperand<TraitsType>(Load->getSourceAddress(),
 739                                                Store->getAddr()))
 740         continue;
 741       Operand *ArithSrcFromLoad = Arith->getSrc(0);
 742       Operand *ArithSrcOther = Arith->getSrc(1);
 743       if (ArithSrcFromLoad != Load->getDest()) {
 744         if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
 745           continue;
 746         std::swap(ArithSrcFromLoad, ArithSrcOther);
 747       }
 748       if (Arith->getDest() != Store->getData())
 749         continue;
 750       if (!canRMW(Arith))
 751         continue;
 752       if (Func->isVerbose(IceV_RMW)) {
 753         Ostream &Str = Func->getContext()->getStrDump();
 754         Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
 755         Load->dump(Func);
 756         Str << "\n  ";
 757         Arith->dump(Func);
 758         Str << "\n  ";
 759         Store->dump(Func);
 760         Str << "\n";
 761       }
 762       Variable *Beacon = Func->makeVariable(IceType_i32);
 763       Beacon->setMustNotHaveReg();
 764       Store->setRmwBeacon(Beacon);
 765       auto *BeaconDef = InstFakeDef::create(Func, Beacon);
 766       Node->getInsts().insert(I3, BeaconDef);
 767       auto *RMW = InstX86FakeRMW::create(Func, ArithSrcOther, Store->getAddr(),
 768                                          Beacon, Arith->getOp());
 769       Node->getInsts().insert(I3, RMW);
 770     }
 771   }
 772   if (Func->isVerbose(IceV_RMW))
 773     Func->getContext()->unlockStr();
 774 }
 775
 776 // Converts a ConstantInteger32 operand into its constant value, or
 777 // MemoryOrderInvalid if the operand is not a ConstantInteger32.
 778 inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
 779   if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
 780     return Integer->getValue();
 781   return Intrinsics::MemoryOrderInvalid;
 782 }
 783
 784 /// Determines whether the dest of a Load instruction can be folded into one of
 785 /// the src operands of a 2-operand instruction. This is true as long as the
 786 /// load dest matches exactly one of the binary instruction's src operands.
 787 /// Replaces Src0 or Src1 with LoadSrc if the answer is true.
 788 inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
 789                                       Operand *&Src0, Operand *&Src1) {
 790   if (Src0 == LoadDest && Src1 != LoadDest) {
 791     Src0 = LoadSrc;
 792     return true;
 793   }
 794   if (Src0 != LoadDest && Src1 == LoadDest) {
 795     Src1 = LoadSrc;
 796     return true;
 797   }
 798   return false;
 799 }
 800
/// Folds loads into the instruction that immediately follows them.
///
/// Every node of the function is walked instruction by instruction. When the
/// current instruction is an InstLoad (or an AtomicLoad intrinsic that passes
/// the checks below) and the next instruction is the last use of the loaded
/// variable, the pair is replaced by one new high-level instruction that uses
/// the memory operand directly as a source. Only Arithmetic, Icmp, Fcmp,
/// Select and Cast consumers are considered. The function is dumped once the
/// pass is finished.
template <typename TraitsType> void TargetX86Base<TraitsType>::doLoadOpt() {
  TimerMarker _(TimerStack::TT_loadOpt, Func);
  for (CfgNode *Node : Func->getNodes()) {
    Context.init(Node);
    while (!Context.atEnd()) {
      // LoadDest/LoadSrc stay null unless CurInst is recognized as a load.
      Variable *LoadDest = nullptr;
      Operand *LoadSrc = nullptr;
      Inst *CurInst = Context.getCur();
      Inst *Next = Context.getNextInst();
      // Determine whether the current instruction is a Load instruction or
      // equivalent.
      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
        // An InstLoad always qualifies.
        LoadDest = Load->getDest();
        // The memory operand is formed with legalization switched off.
        constexpr bool DoLegalize = false;
        LoadSrc = formMemoryOperand(Load->getSourceAddress(),
                                    LoadDest->getType(), DoLegalize);
      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
        // An AtomicLoad intrinsic qualifies as long as it has a valid memory
        // ordering, and can be implemented in a single instruction (i.e., not
        // i64 on x86-32).
        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
        if (ID == Intrinsics::AtomicLoad &&
            (Traits::Is64Bit || Intrin->getDest()->getType() != IceType_i64) &&
            Intrinsics::isMemoryOrderValid(
                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
          LoadDest = Intrin->getDest();
          constexpr bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
                                      DoLegalize);
        }
      }
      // A Load instruction can be folded into the following instruction only
      // if the following instruction ends the Load's Dest variable's live
      // range.
      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
        assert(LoadSrc);
        // NewInst stays null when Next is not a supported consumer or the
        // load dest does not appear in a foldable position.
        Inst *NewInst = nullptr;
        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
          Operand *Src0 = Arith->getSrc(0);
          Operand *Src1 = Arith->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstArithmetic::create(Func, Arith->getOp(),
                                             Arith->getDest(), Src0, Src1);
          }
        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
          Operand *Src0 = Icmp->getSrc(0);
          Operand *Src1 = Icmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
                                       Icmp->getDest(), Src0, Src1);
          }
        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
          Operand *Src0 = Fcmp->getSrc(0);
          Operand *Src1 = Fcmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
                                       Fcmp->getDest(), Src0, Src1);
          }
        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
          // Only the true/false operands are candidates; the condition is
          // passed through unchanged.
          Operand *Src0 = Select->getTrueOperand();
          Operand *Src1 = Select->getFalseOperand();
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstSelect::create(Func, Select->getDest(),
                                         Select->getCondition(), Src0, Src1);
          }
        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
          // The load dest can always be folded into a Cast instruction.
          auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
          if (Src0 == LoadDest) {
            NewInst = InstCast::create(Func, Cast->getCastKind(),
                                       Cast->getDest(), LoadSrc);
          }
        }
        if (NewInst) {
          // Retire both original instructions and insert the fused one.
          CurInst->setDeleted();
          Next->setDeleted();
          Context.insert(NewInst);
          // Update NewInst->LiveRangesEnded so that target lowering may
          // benefit. Also update NewInst->HasSideEffects.
          NewInst->spliceLivenessInfo(Next, CurInst);
        }
      }
      Context.advanceCur();
      Context.advanceNext();
    }
  }
  Func->dump("After load optimization");
}
 890
 891 template <typename TraitsType>
 892 bool TargetX86Base<TraitsType>::doBranchOpt(Inst *I, const CfgNode *NextNode) {
 893   if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
 894     return Br->optimizeBranch(NextNode);
 895   }
 896   return false;
 897 }
 898
 899 template <typename TraitsType>
 900 Variable *TargetX86Base<TraitsType>::getPhysicalRegister(RegNumT RegNum,
 901                                                          Type Ty) {
 902   if (Ty == IceType_void)
 903     Ty = IceType_i32;
 904   if (PhysicalRegisters[Ty].empty())
 905     PhysicalRegisters[Ty].resize(Traits::RegisterSet::Reg_NUM);
 906   assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
 907   Variable *Reg = PhysicalRegisters[Ty][RegNum];
 908   if (Reg == nullptr) {
 909     Reg = Func->makeVariable(Ty);
 910     Reg->setRegNum(RegNum);
 911     PhysicalRegisters[Ty][RegNum] = Reg;
 912     // Specially mark a named physical register as an "argument" so that it is
 913     // considered live upon function entry.  Otherwise it's possible to get
 914     // liveness validation errors for saving callee-save registers.
 915     Func->addImplicitArg(Reg);
 916     // Don't bother tracking the live range of a named physical register.
 917     Reg->setIgnoreLiveness();
 918   }
 919   assert(Traits::getGprForType(Ty, RegNum) == RegNum);
 920   return Reg;
 921 }
 922
 923 template <typename TraitsType>
 924 const char *TargetX86Base<TraitsType>::getRegName(RegNumT RegNum,
 925                                                   Type Ty) const {
 926   return Traits::getRegName(Traits::getGprForType(Ty, RegNum));
 927 }
 928
 929 template <typename TraitsType>
 930 void TargetX86Base<TraitsType>::emitVariable(const Variable *Var) const {
 931   if (!BuildDefs::dump())
 932     return;
 933   Ostream &Str = Ctx->getStrEmit();
 934   if (Var->hasReg()) {
 935     const bool Is64BitSandboxing = Traits::Is64Bit && NeedSandboxing;
 936     const Type VarType = (Var->isRematerializable() && Is64BitSandboxing)
 937                              ? IceType_i64
 938                              : Var->getType();
 939     Str << "%" << getRegName(Var->getRegNum(), VarType);
 940     return;
 941   }
 942   if (Var->mustHaveReg()) {
 943     llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
 944                              ") has no register assigned - function " +
 945                              Func->getFunctionName());
 946   }
 947   const int32_t Offset = Var->getStackOffset();
 948   auto BaseRegNum = Var->getBaseRegNum();
 949   if (BaseRegNum.hasNoValue())
 950     BaseRegNum = getFrameOrStackReg();
 951
 952   // Print in the form "Offset(%reg)", omitting Offset when it is 0.
 953   if (getFlags().getDecorateAsm()) {
 954     Str << Var->getSymbolicStackOffset();
 955   } else if (Offset != 0) {
 956     Str << Offset;
 957   }
 958   const Type FrameSPTy = Traits::WordType;
 959   Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
 960 }
 961
 962 template <typename TraitsType>
 963 typename TargetX86Base<TraitsType>::X86Address
 964 TargetX86Base<TraitsType>::stackVarToAsmOperand(const Variable *Var) const {
 965   if (Var->hasReg())
 966     llvm::report_fatal_error("Stack Variable has a register assigned");
 967   if (Var->mustHaveReg()) {
 968     llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
 969                              ") has no register assigned - function " +
 970                              Func->getFunctionName());
 971   }
 972   int32_t Offset = Var->getStackOffset();
 973   auto BaseRegNum = Var->getBaseRegNum();
 974   if (Var->getBaseRegNum().hasNoValue())
 975     BaseRegNum = getFrameOrStackReg();
 976   return X86Address(Traits::getEncodedGPR(BaseRegNum), Offset,
 977                     AssemblerFixup::NoFixup);
 978 }
 979
/// Emits the function prolog at the start of \p Node and finalizes the stack
/// frame layout.
///
/// In order, this: collects the spilled variables and their size/alignment
/// requirements, pushes the used callee-save registers, links the frame
/// pointer when the frame is ebp-based, computes padding and the total stack
/// adjustment (SpillAreaSizeBytes), emits the stack pointer subtraction,
/// assigns stack offsets to stack-passed arguments (copying them into
/// registers where they were register-allocated), assigns stack offsets to
/// spilled variables and to variables linked to them, and finally marks the
/// frame as computed. When frame verbosity is enabled the resulting layout is
/// dumped.
template <typename TraitsType>
void TargetX86Base<TraitsType>::addProlog(CfgNode *Node) {
  // Stack frame layout:
  //
  // +------------------------+
  // | 1. return address      |
  // +------------------------+
  // | 2. preserved registers |
  // +------------------------+
  // | 3. padding             |
  // +------------------------+
  // | 4. global spill area   |
  // +------------------------+
  // | 5. padding             |
  // +------------------------+
  // | 6. local spill area    |
  // +------------------------+
  // | 7. padding             |
  // +------------------------+
  // | 8. allocas             |
  // +------------------------+
  // | 9. padding             |
  // +------------------------+
  // | 10. out args           |
  // +------------------------+ <--- StackPointer
  //
  // The following variables record the size in bytes of the given areas:
  //  * X86_RET_IP_SIZE_BYTES:  area 1
  //  * PreservedRegsSizeBytes: area 2
  //  * SpillAreaPaddingBytes:  area 3
  //  * GlobalsSize:            area 4
  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
  //  * LocalsSpillAreaSize:    area 6
  //  * SpillAreaSizeBytes:     areas 3 - 10
  //  * maxOutArgsSizeBytes():  area 10

  // Determine stack frame offsets for each Variable without a register
  // assignment. This can be done as one variable per stack slot. Or, do
  // coalescing by running the register allocator again with an infinite set of
  // registers (as a side effect, this gives variables a second chance at
  // physical register assignment).
  //
  // A middle ground approach is to leverage sparsity and allocate one block of
  // space on the frame for globals (variables with multi-block lifetime), and
  // one block to share for locals (single-block lifetime).

  // All prolog instructions are inserted at the start of Node.
  Context.init(Node);
  Context.setInsertPoint(Context.getCur());

  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
  RegsUsed = SmallBitVector(CalleeSaves.size());
  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
  size_t GlobalsSize = 0;
  // If there is a separate locals area, this represents that area. Otherwise
  // it counts any variable not counted by GlobalsSize.
  SpillAreaSizeBytes = 0;
  // If there is a separate locals area, this specifies the alignment for it.
  uint32_t LocalsSlotsAlignmentBytes = 0;
  // The entire spill locations area gets aligned to largest natural alignment
  // of the variables that have a spill slot.
  uint32_t SpillAreaAlignmentBytes = 0;
  // A spill slot linked to a variable with a stack slot should reuse that
  // stack slot.
  // The hook returns true for such variables and records them so their
  // offsets can be copied from the link root further below.
  std::function<bool(Variable *)> TargetVarHook =
      [&VariablesLinkedToSpillSlots](Variable *Var) {
        if (Var->getLinkedTo() != nullptr) {
          // TODO(stichnot): This assert won't necessarily be true in the
          // future.
          assert(Var->mustNotHaveReg());
          if (!Var->getLinkedTo()->hasReg()) {
            VariablesLinkedToSpillSlots.push_back(Var);
            return true;
          }
        }
        return false;
      };

  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
                        &LocalsSlotsAlignmentBytes, TargetVarHook);
  // Remember the locals-only size before the globals are added in.
  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
  SpillAreaSizeBytes += GlobalsSize;

  // Add push instructions for preserved registers.
  uint32_t NumCallee = 0;
  size_t PreservedRegsSizeBytes = 0;
  // Collapse the used callee-save registers onto their base registers so
  // that each base register is pushed once.
  SmallBitVector Pushed(CalleeSaves.size());
  for (RegNumT i : RegNumBVIter(CalleeSaves)) {
    const auto Canonical = Traits::getBaseReg(i);
    assert(Canonical == Traits::getBaseReg(Canonical));
    if (RegsUsed[i]) {
      Pushed[Canonical] = true;
    }
  }
  for (RegNumT RegNum : RegNumBVIter(Pushed)) {
    assert(RegNum == Traits::getBaseReg(RegNum));
    ++NumCallee;
    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
    _push_reg(getPhysicalRegister(RegNum, Traits::WordType));
  }
  Ctx->statsUpdateRegistersSaved(NumCallee);

  // Generate "push frameptr; mov frameptr, stackptr"
  if (IsEbpBasedFrame) {
    assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
               .count() == 0);
    // The saved frame pointer occupies one more word of the preserved area.
    PreservedRegsSizeBytes += typeWidthInBytes(Traits::WordType);
    _link_bp();
  }

  // Align the variables area. SpillAreaPaddingBytes is the size of the region
  // after the preserved registers and before the spill areas.
  // LocalsSlotsPaddingBytes is the amount of padding between the globals and
  // locals area if they are separate.
  assert(SpillAreaAlignmentBytes <= Traits::X86_STACK_ALIGNMENT_BYTES);
  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
  uint32_t SpillAreaPaddingBytes = 0;
  uint32_t LocalsSlotsPaddingBytes = 0;
  alignStackSpillAreas(Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
                       SpillAreaAlignmentBytes, GlobalsSize,
                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
                       &LocalsSlotsPaddingBytes);
  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
  uint32_t GlobalsAndSubsequentPaddingSize =
      GlobalsSize + LocalsSlotsPaddingBytes;

  // Functions returning scalar floating point types may need to convert values
  // from an in-register xmm value to the top of the x87 floating point stack.
  // This is done by a movp[sd] and an fld[sd].  Ensure there is enough scratch
  // space on the stack for this.
  const Type ReturnType = Func->getReturnType();
  if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
    if (isScalarFloatingType(ReturnType)) {
      // Avoid misaligned double-precision load/store.
      NeedsStackAlignment = true;
      SpillAreaSizeBytes =
          std::max(typeWidthInBytesOnStack(ReturnType), SpillAreaSizeBytes);
    }
  }

  // Align esp if necessary.
  if (NeedsStackAlignment) {
    // Round up twice: once for the spill area, and once more after adding the
    // out-args area. SpillAreaSizeBytes is then what remains after the return
    // address and preserved registers.
    uint32_t StackOffset =
        Traits::X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
    uint32_t StackSize =
        Traits::applyStackAlignment(StackOffset + SpillAreaSizeBytes);
    StackSize = Traits::applyStackAlignment(StackSize + maxOutArgsSizeBytes());
    SpillAreaSizeBytes = StackSize - StackOffset;
  } else {
    SpillAreaSizeBytes += maxOutArgsSizeBytes();
  }

  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
  // fixed allocations in the prolog.
  if (PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;
  if (SpillAreaSizeBytes) {
    // Generate "sub stackptr, SpillAreaSizeBytes"
    _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
    // If the fixed allocas are aligned more than the stack frame, align the
    // stack pointer accordingly.
    if (PrologEmitsFixedAllocas &&
        FixedAllocaAlignBytes > Traits::X86_STACK_ALIGNMENT_BYTES) {
      assert(IsEbpBasedFrame);
      _and(getPhysicalRegister(getStackReg(), Traits::WordType),
           Ctx->getConstantInt32(-FixedAllocaAlignBytes));
    }
  }

  // Account for known-frame-offset alloca instructions that were not already
  // combined into the prolog.
  if (!PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

  // Fill in stack offsets for stack args, and copy args into registers for
  // those that were register-allocated. Args are pushed right to left, so
  // Arg[0] is closest to the stack/frame pointer.
  Variable *FramePtr =
      getPhysicalRegister(getFrameOrStackReg(), Traits::WordType);
  size_t BasicFrameOffset =
      PreservedRegsSizeBytes + Traits::X86_RET_IP_SIZE_BYTES;
  // Without a frame pointer, offsets are relative to the stack pointer, so
  // the spill area is added to the base offset as well.
  if (!IsEbpBasedFrame)
    BasicFrameOffset += SpillAreaSizeBytes;

  emitGetIP(Node);

  const VarList &Args = Func->getArgs();
  size_t InArgsSizeBytes = 0;
  unsigned NumXmmArgs = 0;
  unsigned NumGPRArgs = 0;
  for (Variable *Arg : Args) {
    // Skip arguments passed in registers.
    if (isVectorType(Arg->getType())) {
      if (Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else if (isScalarFloatingType(Arg->getType())) {
      if (Traits::X86_PASS_SCALAR_FP_IN_XMM &&
          Traits::getRegisterForXmmArgNum(NumXmmArgs).hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else {
      assert(isScalarIntegerType(Arg->getType()));
      if (Traits::getRegisterForGprArgNum(Traits::WordType, NumGPRArgs)
              .hasValue()) {
        ++NumGPRArgs;
        continue;
      }
    }
    // For esp-based frames where the allocas are done outside the prolog, the
    // esp value may not stabilize to its home value until after all the
    // fixed-size alloca instructions have executed.  In this case, a stack
    // adjustment is needed when accessing in-args in order to copy them into
    // registers.
    // NOTE(review): StackAdjBytes is unsigned, so this subtraction wraps; the
    // wrapped value is only used as an addend in finishArgumentLowering().
    size_t StackAdjBytes = 0;
    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
      StackAdjBytes -= FixedAllocaSizeBytes;
    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
                           InArgsSizeBytes);
  }

  // Fill in stack offsets for locals.
  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
                      IsEbpBasedFrame);
  // Assign stack offsets to variables that have been linked to spilled
  // variables.
  for (Variable *Var : VariablesLinkedToSpillSlots) {
    const Variable *Root = Var->getLinkedToRoot();
    assert(Root != nullptr);
    Var->setStackOffset(Root->getStackOffset());
  }
  this->HasComputedFrame = true;

  // Optional diagnostic dump of the computed layout.
  if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
    OstreamLocker L(Func->getContext());
    Ostream &Str = Func->getContext()->getStrDump();

    Str << "Stack layout:\n";
    // Whatever part of the adjustment is not attributed to a named area is
    // reported as esp alignment padding.
    uint32_t EspAdjustmentPaddingSize =
        SpillAreaSizeBytes - LocalsSpillAreaSize -
        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
        maxOutArgsSizeBytes();
    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
        << " return address = " << Traits::X86_RET_IP_SIZE_BYTES << " bytes\n"
        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
        << " globals spill area = " << GlobalsSize << " bytes\n"
        << " globals-locals spill areas intermediate padding = "
        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
        << " esp alignment padding = " << EspAdjustmentPaddingSize
        << " bytes\n";

    Str << "Stack details:\n"
        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
        << " bytes\n"
        << " is ebp based = " << IsEbpBasedFrame << "\n";
  }
}
1248
1249 /// Helper function for addProlog().
1250 ///
1251 /// This assumes Arg is an argument passed on the stack. This sets the frame
1252 /// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
1253 /// I64 arg that has been split into Lo and Hi components, it calls itself
1254 /// recursively on the components, taking care to handle Lo first because of the
1255 /// little-endian architecture. Lastly, this function generates an instruction
1256 /// to copy Arg into its assigned register if applicable.
1257 template <typename TraitsType>
1258 void TargetX86Base<TraitsType>::finishArgumentLowering(
1259     Variable *Arg, Variable *FramePtr, size_t BasicFrameOffset,
1260     size_t StackAdjBytes, size_t &InArgsSizeBytes) {
1261   if (!Traits::Is64Bit) {
1262     if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
1263       Variable *Lo = Arg64On32->getLo();
1264       Variable *Hi = Arg64On32->getHi();
1265       finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, StackAdjBytes,
1266                              InArgsSizeBytes);
1267       finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, StackAdjBytes,
1268                              InArgsSizeBytes);
1269       return;
1270     }
1271   }
1272   Type Ty = Arg->getType();
1273   if (isVectorType(Ty)) {
1274     InArgsSizeBytes = Traits::applyStackAlignment(InArgsSizeBytes);
1275   }
1276   Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
1277   InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
1278   if (Arg->hasReg()) {
1279     assert(Ty != IceType_i64 || Traits::Is64Bit);
1280     auto *Mem = X86OperandMem::create(
1281         Func, Ty, FramePtr,
1282         Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
1283     if (isVectorType(Arg->getType())) {
1284       _movp(Arg, Mem);
1285     } else {
1286       _mov(Arg, Mem);
1287     }
1288     // This argument-copying instruction uses an explicit X86OperandMem
1289     // operand instead of a Variable, so its fill-from-stack operation has to
1290     // be tracked separately for statistics.
1291     Ctx->statsUpdateFills();
1292   }
1293 }
1294
1295 template <typename TraitsType>
1296 void TargetX86Base<TraitsType>::addEpilog(CfgNode *Node) {
1297   InstList &Insts = Node->getInsts();
1298   InstList::reverse_iterator RI, E;
1299   for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
1300     if (llvm::isa<typename Traits::Insts::Ret>(*RI))
1301       break;
1302   }
1303   if (RI == E)
1304     return;
1305
1306   // Convert the reverse_iterator position into its corresponding (forward)
1307   // iterator position.
1308   InstList::iterator InsertPoint = RI.base();
1309   --InsertPoint;
1310   Context.init(Node);
1311   Context.setInsertPoint(InsertPoint);
1312
1313   if (IsEbpBasedFrame) {
1314     _unlink_bp();
1315   } else {
1316     // add stackptr, SpillAreaSizeBytes
1317     if (SpillAreaSizeBytes != 0) {
1318       _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
1319     }
1320   }
1321
1322   // Add pop instructions for preserved registers.
1323   SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
1324   SmallBitVector Popped(CalleeSaves.size());
1325   for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
1326     const auto RegNum = RegNumT::fromInt(i);
1327     if (RegNum == getFrameReg() && IsEbpBasedFrame)
1328       continue;
1329     const RegNumT Canonical = Traits::getBaseReg(RegNum);
1330     if (CalleeSaves[i] && RegsUsed[i]) {
1331       Popped[Canonical] = true;
1332     }
1333   }
1334   for (int32_t i = Popped.size() - 1; i >= 0; --i) {
1335     if (!Popped[i])
1336       continue;
1337     const auto RegNum = RegNumT::fromInt(i);
1338     assert(RegNum == Traits::getBaseReg(RegNum));
1339     _pop(getPhysicalRegister(RegNum, Traits::WordType));
1340   }
1341
1342   if (!NeedSandboxing) {
1343     return;
1344   }
1345   emitSandboxedReturn();
1346   if (RI->getSrcSize()) {
1347     auto *RetValue = llvm::cast<Variable>(RI->getSrc(0));
1348     Context.insert<InstFakeUse>(RetValue);
1349   }
1350   RI->setDeleted();
1351 }
1352
1353 template <typename TraitsType> Type TargetX86Base<TraitsType>::stackSlotType() {
1354   return Traits::WordType;
1355 }
1356
1357 template <typename TraitsType>
1358 template <typename T>
1359 typename std::enable_if<!T::Is64Bit, Operand>::type *
1360 TargetX86Base<TraitsType>::loOperand(Operand *Operand) {
1361   assert(Operand->getType() == IceType_i64 ||
1362          Operand->getType() == IceType_f64);
1363   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
1364     return Operand;
1365   if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
1366     return Var64On32->getLo();
1367   if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
1368     auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
1369         Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue())));
1370     // Check if we need to blind/pool the constant.
1371     return legalize(ConstInt);
1372   }
1373   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
1374     auto *MemOperand = X86OperandMem::create(
1375         Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getIndex(),
1376         Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
1377     // Test if we should randomize or pool the offset, if so randomize it or
1378     // pool it then create mem operand with the blinded/pooled constant.
1379     // Otherwise, return the mem operand as ordinary mem operand.
1380     return legalize(MemOperand);
1381   }
1382   llvm_unreachable("Unsupported operand type");
1383   return nullptr;
1384 }
1385
1386 template <typename TraitsType>
1387 template <typename T>
1388 typename std::enable_if<!T::Is64Bit, Operand>::type *
1389 TargetX86Base<TraitsType>::hiOperand(Operand *Operand) {
1390   assert(Operand->getType() == IceType_i64 ||
1391          Operand->getType() == IceType_f64);
1392   if (Operand->getType() != IceType_i64 && Operand->getType() != IceType_f64)
1393     return Operand;
1394   if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
1395     return Var64On32->getHi();
1396   if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
1397     auto *ConstInt = llvm::dyn_cast<ConstantInteger32>(
1398         Ctx->getConstantInt32(static_cast<int32_t>(Const->getValue() >> 32)));
1399     // Check if we need to blind/pool the constant.
1400     return legalize(ConstInt);
1401   }
1402   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Operand)) {
1403     Constant *Offset = Mem->getOffset();
1404     if (Offset == nullptr) {
1405       Offset = Ctx->getConstantInt32(4);
1406     } else if (auto *IntOffset = llvm::dyn_cast<ConstantInteger32>(Offset)) {
1407       Offset = Ctx->getConstantInt32(4 + IntOffset->getValue());
1408     } else if (auto *SymOffset = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
1409       assert(!Utils::WouldOverflowAdd(SymOffset->getOffset(), 4));
1410       Offset =
1411           Ctx->getConstantSym(4 + SymOffset->getOffset(), SymOffset->getName());
1412     }
1413     auto *MemOperand = X86OperandMem::create(
1414         Func, IceType_i32, Mem->getBase(), Offset, Mem->getIndex(),
1415         Mem->getShift(), Mem->getSegmentRegister(), Mem->getIsRebased());
1416     // Test if the Offset is an eligible i32 constants for randomization and
1417     // pooling. Blind/pool it if it is. Otherwise return as oridinary mem
1418     // operand.
1419     return legalize(MemOperand);
1420   }
1421   llvm_unreachable("Unsupported operand type");
1422   return nullptr;
1423 }
1424
1425 template <typename TraitsType>
1426 SmallBitVector
1427 TargetX86Base<TraitsType>::getRegisterSet(RegSetMask Include,
1428                                           RegSetMask Exclude) const {
1429   return Traits::getRegisterSet(getFlags(), Include, Exclude);
1430 }
1431
/// Lowers an alloca instruction.
///
/// The requested size is rounded up to the effective alignment, which is the
/// larger of the instruction's alignment and the target stack alignment.
/// Constant sizes are either subtracted from the stack pointer (when a frame
/// pointer is in use) or accumulated into FixedAllocaSizeBytes (when the
/// alloca has a known frame offset and no frame pointer is needed).
/// Non-constant sizes are rounded up at run time and subtracted from the
/// stack pointer. The destination then receives the stack pointer value,
/// offset by the out-args area size when that area is non-empty.
template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerAlloca(const InstAlloca *Instr) {
  // Conservatively require the stack to be aligned. Some stack adjustment
  // operations implemented below assume that the stack is aligned before the
  // alloca. All the alloca code ensures that the stack alignment is preserved
  // after the alloca. The stack alignment restriction can be relaxed in some
  // cases.
  NeedsStackAlignment = true;

  // For default align=0, set it to the real value 1, to avoid any
  // bit-manipulation problems below.
  const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());

  // LLVM enforces power of 2 alignment.
  assert(llvm::isPowerOf2_32(AlignmentParam));
  assert(llvm::isPowerOf2_32(Traits::X86_STACK_ALIGNMENT_BYTES));

  const uint32_t Alignment =
      std::max(AlignmentParam, Traits::X86_STACK_ALIGNMENT_BYTES);
  const bool OverAligned = Alignment > Traits::X86_STACK_ALIGNMENT_BYTES;
  const bool OptM1 = Func->getOptLevel() == Opt_m1;
  const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
  // A frame pointer is used if one already exists, or if the alloca is
  // over-aligned, has no known frame offset, or is lowered at Om1.
  const bool UseFramePointer =
      hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;

  if (UseFramePointer)
    setHasFramePointer();

  Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
  if (OverAligned) {
    // Mask the stack pointer down to the requested alignment.
    _and(esp, Ctx->getConstantInt32(-Alignment));
  }

  Variable *Dest = Instr->getDest();
  Operand *TotalSize = legalize(Instr->getSizeInBytes());

  if (const auto *ConstantTotalSize =
          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
    // Constant size: the rounding is done here rather than at run time.
    const uint32_t Value =
        Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
    if (UseFramePointer) {
      _sub_sp(Ctx->getConstantInt32(Value));
    } else {
      // If we don't need a Frame Pointer, this alloca has a known offset to the
      // stack pointer. We don't need adjust the stack pointer, nor assign any
      // value to Dest, as Dest is rematerializable.
      assert(Dest->isRematerializable());
      FixedAllocaSizeBytes += Value;
      Context.insert<InstFakeDef>(Dest);
    }
  } else {
    // Non-constant sizes need to be adjusted to the next highest multiple of
    // the required alignment at runtime.
    Variable *T = nullptr;
    if (Traits::Is64Bit && TotalSize->getType() != IceType_i64 &&
        !NeedSandboxing) {
      // Zero-extend the size into a 64-bit temporary.
      T = makeReg(IceType_i64);
      _movzx(T, TotalSize);
    } else {
      T = makeReg(IceType_i32);
      _mov(T, TotalSize);
    }
    // Round up: T = (T + Alignment - 1) & -Alignment.
    _add(T, Ctx->getConstantInt32(Alignment - 1));
    _and(T, Ctx->getConstantInt32(-Alignment));
    _sub_sp(T);
  }
  // Add enough to the returned address to account for the out args area.
  uint32_t OutArgsSize = maxOutArgsSizeBytes();
  if (OutArgsSize > 0) {
    Variable *T = makeReg(IceType_i32);
    auto *CalculateOperand = X86OperandMem::create(
        Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
    _lea(T, CalculateOperand);
    _mov(Dest, T);
  } else {
    _mov(Dest, esp);
  }
}
1510
1511 template <typename TraitsType>
1512 void TargetX86Base<TraitsType>::lowerArguments() {
       // Rewrites the function's argument list so that arguments which are given
       // an argument register by Traits are represented by a fresh variable
       // pre-colored to that register. For each such argument, an assignment from
       // the register variable to the original argument variable is inserted
       // through Context, which is positioned on the entry node. Arguments that
       // get no register are left in the list unchanged.
1513   const bool OptM1 = Func->getOptLevel() == Opt_m1;
1514   VarList &Args = Func->getArgs();
       // Running counts of the XMM / GPR argument registers handed out so far.
       // Each *SlotsRemain flag is cleared the first time the matching Traits
       // query reports that no register is available.
1515   unsigned NumXmmArgs = 0;
1516   bool XmmSlotsRemain = true;
1517   unsigned NumGprArgs = 0;
1518   bool GprSlotsRemain = true;
1519
       // Point the lowering context at the function's entry node; the
       // assignments created in the loop are inserted through it.
1520   Context.init(Func->getEntryNode());
1521   Context.setInsertPoint(Context.getCur());
1522
       // The scan ends early once neither XMM nor GPR argument slots remain.
1523   for (SizeT i = 0, End = Args.size();
1524        i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
1525     Variable *Arg = Args[i];
1526     Type Ty = Arg->getType();
1527     Variable *RegisterArg = nullptr;
1528     RegNumT RegNum;
1529     if (isVectorType(Ty)) {
           // Vector arguments take the next XMM argument register, if any.
1530       RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs);
1531       if (RegNum.hasNoValue()) {
1532         XmmSlotsRemain = false;
1533         continue;
1534       }
1535       ++NumXmmArgs;
1536       RegisterArg = Func->makeVariable(Ty);
1537     } else if (isScalarFloatingType(Ty)) {
           // Scalar floating-point arguments share the XMM argument counter with
           // vectors, but only when Traits says they are passed in XMM registers;
           // otherwise the argument is skipped without touching the counters.
1538       if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
1539         continue;
1540       }
1541       RegNum = Traits::getRegisterForXmmArgNum(NumXmmArgs);
1542       if (RegNum.hasNoValue()) {
1543         XmmSlotsRemain = false;
1544         continue;
1545       }
1546       ++NumXmmArgs;
1547       RegisterArg = Func->makeVariable(Ty);
1548     } else if (isScalarIntegerType(Ty)) {
           // Scalar integers take the next GPR argument register for their type,
           // if any.
1549       RegNum = Traits::getRegisterForGprArgNum(Ty, NumGprArgs);
1550       if (RegNum.hasNoValue()) {
1551         GprSlotsRemain = false;
1552         continue;
1553       }
1554       ++NumGprArgs;
1555       RegisterArg = Func->makeVariable(Ty);
1556     }
         // Every path that reaches this point is expected to have picked both a
         // register and a register variable.
1557     assert(RegNum.hasValue());
1558     assert(RegisterArg != nullptr);
1559     // Replace Arg in the argument list with the home register. Then generate
1560     // an instruction in the prolog to copy the home register to the assigned
1561     // location of Arg.
1562     if (BuildDefs::dump())
1563       RegisterArg->setName(Func, "home_reg:" + Arg->getName());
1564     RegisterArg->setRegNum(RegNum);
1565     RegisterArg->setIsArg();
1566     Arg->setIsArg(false);
1567
1568     Args[i] = RegisterArg;
1569     // When not Om1, do the assignment through a temporary, instead of directly
1570     // from the pre-colored variable, so that a subsequent availabilityGet()
1571     // call has a chance to work.  (In Om1, don't bother creating extra
1572     // instructions with extra variables to register-allocate.)
1573     if (OptM1) {
1574       Context.insert<InstAssign>(Arg, RegisterArg);
1575     } else {
1576       Variable *Tmp = makeReg(RegisterArg->getType());
1577       Context.insert<InstAssign>(Tmp, RegisterArg);
1578       Context.insert<InstAssign>(Arg, Tmp);
1579     }
1580   }
       // NOTE(review): presumably this publishes the Tmp copies created above so
       // that later availabilityGet() calls can find them -- confirm against
       // availabilityUpdate().
1581   if (!OptM1)
1582     Context.availabilityUpdate();
1583 }
1584
1585 /// Strength-reduce scalar integer multiplication by a constant (for i32 or
1586 /// narrower) for certain constants. The lea instruction can be used to multiply
1587 /// by 3, 5, or 9, and the lsh instruction can be used to multiply by powers of
1588 /// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
1589 /// lea-based multiplies by 5, combined with left-shifting by 2.
1590 template <typename TraitsType>
1591 bool TargetX86Base<TraitsType>::optimizeScalarMul(Variable *Dest, Operand *Src0,
1592                                                   int32_t Src1) {
1593   // Disable this optimization for Om1 and O0, just to keep things simple
1594   // there.
1595   if (Func->getOptLevel() < Opt_1)
1596     return false;
1597   Type Ty = Dest->getType();
1598   if (Src1 == -1) {
1599     Variable *T = nullptr;
1600     _mov(T, Src0);
1601     _neg(T);
1602     _mov(Dest, T);
1603     return true;
1604   }
1605   if (Src1 == 0) {
1606     _mov(Dest, Ctx->getConstantZero(Ty));
1607     return true;
1608   }
1609   if (Src1 == 1) {
1610     Variable *T = nullptr;
1611     _mov(T, Src0);
1612     _mov(Dest, T);
1613     return true;
1614   }
1615   // Don't bother with the edge case where Src1 == MININT.
1616   if (Src1 == -Src1)
1617     return false;
1618   const bool Src1IsNegative = Src1 < 0;
1619   if (Src1IsNegative)
1620     Src1 = -Src1;
1621   uint32_t Count9 = 0;
1622   uint32_t Count5 = 0;
1623   uint32_t Count3 = 0;
1624   uint32_t Count2 = 0;
1625   uint32_t CountOps = 0;
1626   while (Src1 > 1) {
1627     if (Src1 % 9 == 0) {
1628       ++CountOps;
1629       ++Count9;
1630       Src1 /= 9;
1631     } else if (Src1 % 5 == 0) {
1632       ++CountOps;
1633       ++Count5;
1634       Src1 /= 5;
1635     } else if (Src1 % 3 == 0) {
1636       ++CountOps;
1637       ++Count3;
1638       Src1 /= 3;
1639     } else if (Src1 % 2 == 0) {
1640       if (Count2 == 0)
1641         ++CountOps;
1642       ++Count2;
1643       Src1 /= 2;
1644     } else {
1645       return false;
1646     }
1647   }
1648   // Lea optimization only works for i16 and i32 types, not i8.
1649   if (Ty != IceType_i32 && !(Traits::Is64Bit && Ty == IceType_i64) &&
1650       (Count3 || Count5 || Count9))
1651     return false;
1652   // Limit the number of lea/shl operations for a single multiply, to a
1653   // somewhat arbitrary choice of 3.
1654   constexpr uint32_t MaxOpsForOptimizedMul = 3;
1655   if (CountOps > MaxOpsForOptimizedMul)
1656     return false;
1657   Variable *T = makeReg(Traits::WordType);
1658   if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
1659     _movzx(T, Src0);
1660   } else {
1661     _mov(T, Src0);
1662   }
1663   Constant *Zero = Ctx->getConstantZero(IceType_i32);
1664   for (uint32_t i = 0; i < Count9; ++i) {
1665     constexpr uint16_t Shift = 3; // log2(9-1)
1666     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1667   }
1668   for (uint32_t i = 0; i < Count5; ++i) {
1669     constexpr uint16_t Shift = 2; // log2(5-1)
1670     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1671   }
1672   for (uint32_t i = 0; i < Count3; ++i) {
1673     constexpr uint16_t Shift = 1; // log2(3-1)
1674     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1675   }
1676   if (Count2) {
1677     _shl(T, Ctx->getConstantInt(Ty, Count2));
1678   }
1679   if (Src1IsNegative)
1680     _neg(T);
1681   _mov(Dest, T);
1682   return true;
1683 }
1684
1685 template <typename TraitsType>
1686 void TargetX86Base<TraitsType>::lowerShift64(InstArithmetic::OpKind Op,
1687                                              Operand *Src0Lo, Operand *Src0Hi,
1688                                              Operand *Src1Lo, Variable *DestLo,
1689                                              Variable *DestHi) {
       // Emits the lowering of a 64-bit shift whose operands are supplied as
       // separate low/high halves: (Src0Hi:Src0Lo) is shifted by Src1Lo and the
       // result is written to (DestHi:DestLo). Op selects Shl, Lshr, or Ashr; any
       // other value hits the "non-shift op" assert. Only the low half of the
       // shift amount is taken as a parameter.
       //
       // A ConstantInteger32 shift amount is lowered as straight-line code, with
       // separate sequences for amounts above 32, exactly 32, and below 32. Any
       // other shift amount is copied into cl and lowered with shld/shrd, followed
       // by a runtime test of bit 0x20 of the count that branches around a fixup
       // of the two halves.
1690   // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
1691   Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
1692   Constant *Zero = Ctx->getConstantZero(IceType_i32);
       // Shift count used with sar to fill a half with copies of the sign bit.
1693   Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1694   if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
1695     uint32_t ShiftAmount = ConstantShiftAmount->getValue();
1696     if (ShiftAmount > 32) {
           // One source half is shifted out entirely; the other is shifted by the
           // amount in excess of 32 and lands in the opposite half of the result.
1697       Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
1698       switch (Op) {
1699       default:
1700         assert(0 && "non-shift op");
1701         break;
1702       case InstArithmetic::Shl: {
1703         // a=b<<c ==>
1704         //   t2 = b.lo
1705         //   t2 = shl t2, ShiftAmount-32
1706         //   a.hi = t2
1707         //   a.lo = 0
1708         _mov(T_2, Src0Lo);
1709         _shl(T_2, ReducedShift);
1710         _mov(DestHi, T_2);
1711         _mov(DestLo, Zero);
1712       } break;
1713       case InstArithmetic::Lshr: {
1714         // a=b>>c (unsigned) ==>
1715         //   t2 = b.hi
1716         //   t2 = shr t2, ShiftAmount-32
1717         //   a.lo = t2
1718         //   a.hi = 0
1719         _mov(T_2, Src0Hi);
1720         _shr(T_2, ReducedShift);
1721         _mov(DestLo, T_2);
1722         _mov(DestHi, Zero);
1723       } break;
1724       case InstArithmetic::Ashr: {
1725         // a=b>>c (signed) ==>
1726         //   t3 = b.hi
1727         //   t3 = sar t3, 0x1f
1728         //   t2 = b.hi
1729         //   t2 = shrd t2, t3, ShiftAmount-32
1730         //   a.lo = t2
1731         //   a.hi = t3
1732         _mov(T_3, Src0Hi);
1733         _sar(T_3, SignExtend);
1734         _mov(T_2, Src0Hi);
1735         _shrd(T_2, T_3, ReducedShift);
1736         _mov(DestLo, T_2);
1737         _mov(DestHi, T_3);
1738       } break;
1739       }
1740     } else if (ShiftAmount == 32) {
           // A shift by exactly 32 just moves one half into the other; no shift
           // instruction is needed apart from the sign fill for Ashr.
1741       switch (Op) {
1742       default:
1743         assert(0 && "non-shift op");
1744         break;
1745       case InstArithmetic::Shl: {
1746         // a=b<<c ==>
1747         //   t2 = b.lo
1748         //   a.hi = t2
1749         //   a.lo = 0
1750         _mov(T_2, Src0Lo);
1751         _mov(DestHi, T_2);
1752         _mov(DestLo, Zero);
1753       } break;
1754       case InstArithmetic::Lshr: {
1755         // a=b>>c (unsigned) ==>
1756         //   t2 = b.hi
1757         //   a.lo = t2
1758         //   a.hi = 0
1759         _mov(T_2, Src0Hi);
1760         _mov(DestLo, T_2);
1761         _mov(DestHi, Zero);
1762       } break;
1763       case InstArithmetic::Ashr: {
1764         // a=b>>c (signed) ==>
1765         //   t2 = b.hi
1766         //   a.lo = t2
1767         //   t3 = b.hi
1768         //   t3 = sar t3, 0x1f
1769         //   a.hi = t3
1770         _mov(T_2, Src0Hi);
1771         _mov(DestLo, T_2);
1772         _mov(T_3, Src0Hi);
1773         _sar(T_3, SignExtend);
1774         _mov(DestHi, T_3);
1775       } break;
1776       }
1777     } else {
           // Constant shift amount below 32: both halves contribute to the
           // result, so a double-precision shift (shld/shrd) is paired with a
           // plain shift of the other half.
1778       // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1779       //   t2 = b.lo
1780       //   t3 = b.hi
1781       _mov(T_2, Src0Lo);
1782       _mov(T_3, Src0Hi);
1783       switch (Op) {
1784       default:
1785         assert(0 && "non-shift op");
1786         break;
1787       case InstArithmetic::Shl: {
1788         // a=b<<c ==>
1789         //   t3 = shld t3, t2, ShiftAmount
1790         //   t2 = shl t2, ShiftAmount
1791         _shld(T_3, T_2, ConstantShiftAmount);
1792         _shl(T_2, ConstantShiftAmount);
1793       } break;
1794       case InstArithmetic::Lshr: {
1795         // a=b>>c (unsigned) ==>
1796         //   t2 = shrd t2, t3, ShiftAmount
1797         //   t3 = shr t3, ShiftAmount
1798         _shrd(T_2, T_3, ConstantShiftAmount);
1799         _shr(T_3, ConstantShiftAmount);
1800       } break;
1801       case InstArithmetic::Ashr: {
1802         // a=b>>c (signed) ==>
1803         //   t2 = shrd t2, t3, ShiftAmount
1804         //   t3 = sar t3, ShiftAmount
1805         _shrd(T_2, T_3, ConstantShiftAmount);
1806         _sar(T_3, ConstantShiftAmount);
1807       } break;
1808       }
1809       // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1810       //   a.lo = t2
1811       //   a.hi = t3
1812       _mov(DestLo, T_2);
1813       _mov(DestHi, T_3);
1814     }
1815   } else {
1816     // NON-CONSTANT CASES.
         // BitTest (0x20) is tested against the shift count at runtime: when that
         // bit is clear the code branches to Label and skips the fixup that
         // follows the branch in each case.
1817     Constant *BitTest = Ctx->getConstantInt32(0x20);
1818     InstX86Label *Label = InstX86Label::create(Func, this);
1819     // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1820     //   t1:ecx = c.lo & 0xff
1821     //   t2 = b.lo
1822     //   t3 = b.hi
1823     T_1 = copyToReg8(Src1Lo, Traits::RegisterSet::Reg_cl);
1824     _mov(T_2, Src0Lo);
1825     _mov(T_3, Src0Hi);
1826     switch (Op) {
1827     default:
1828       assert(0 && "non-shift op");
1829       break;
1830     case InstArithmetic::Shl: {
1831       // a=b<<c ==>
1832       //   t3 = shld t3, t2, t1
1833       //   t2 = shl t2, t1
1834       //   test t1, 0x20
1835       //   je L1
1836       //   use(t3)
1837       //   t3 = t2
1838       //   t2 = 0
1839       _shld(T_3, T_2, T_1);
1840       _shl(T_2, T_1);
1841       _test(T_1, BitTest);
1842       _br(Traits::Cond::Br_e, Label);
1843       // T_2 and T_3 are being assigned again because of the intra-block control
1844       // flow, so we need to use _redefined to avoid liveness problems.
1845       _redefined(_mov(T_3, T_2));
1846       _redefined(_mov(T_2, Zero));
1847     } break;
1848     case InstArithmetic::Lshr: {
1849       // a=b>>c (unsigned) ==>
1850       //   t2 = shrd t2, t3, t1
1851       //   t3 = shr t3, t1
1852       //   test t1, 0x20
1853       //   je L1
1854       //   use(t2)
1855       //   t2 = t3
1856       //   t3 = 0
1857       _shrd(T_2, T_3, T_1);
1858       _shr(T_3, T_1);
1859       _test(T_1, BitTest);
1860       _br(Traits::Cond::Br_e, Label);
1861       // T_2 and T_3 are being assigned again because of the intra-block control
1862       // flow, so we need to use _redefined to avoid liveness problems.
1863       _redefined(_mov(T_2, T_3));
1864       _redefined(_mov(T_3, Zero));
1865     } break;
1866     case InstArithmetic::Ashr: {
1867       // a=b>>c (signed) ==>
1868       //   t2 = shrd t2, t3, t1
1869       //   t3 = sar t3, t1
1870       //   test t1, 0x20
1871       //   je L1
1872       //   use(t2)
1873       //   t2 = t3
1874       //   t3 = sar t3, 0x1f
           // NOTE(review): this declaration shadows the function-scope SignExtend,
           // which is created from the same 0x1f constant; the inner one looks
           // redundant.
1875       Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1876       _shrd(T_2, T_3, T_1);
1877       _sar(T_3, T_1);
1878       _test(T_1, BitTest);
1879       _br(Traits::Cond::Br_e, Label);
1880       // T_2 and T_3 are being assigned again because of the intra-block control
1881       // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
1882       // doesn't need special treatment because it is reassigned via _sar
1883       // instead of _mov.
1884       _redefined(_mov(T_2, T_3));
1885       _sar(T_3, SignExtend);
1886     } break;
1887     }
1888     // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1889     // L1:
1890     //   a.lo = t2
1891     //   a.hi = t3
1892     Context.insert(Label);
1893     _mov(DestLo, T_2);
1894     _mov(DestHi, T_3);
1895   }
1896 }
1897
1898 template <typename TraitsType>
1899 void TargetX86Base<TraitsType>::lowerArithmetic(const InstArithmetic *Instr) {
1900   Variable *Dest = Instr->getDest();
1901   if (Dest->isRematerializable()) {
1902     Context.insert<InstFakeDef>(Dest);
1903     return;
1904   }
1905   Type Ty = Dest->getType();
1906   Operand *Src0 = legalize(Instr->getSrc(0));
1907   Operand *Src1 = legalize(Instr->getSrc(1));
1908   if (Instr->isCommutative()) {
1909     uint32_t SwapCount = 0;
1910     if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
1911       std::swap(Src0, Src1);
1912       ++SwapCount;
1913     }
1914     if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
1915       std::swap(Src0, Src1);
1916       ++SwapCount;
1917     }
1918     // Improve two-address code patterns by avoiding a copy to the dest
1919     // register when one of the source operands ends its lifetime here.
1920     if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
1921       std::swap(Src0, Src1);
1922       ++SwapCount;
1923     }
1924     assert(SwapCount <= 1);
1925     (void)SwapCount;
1926   }
1927   if (!Traits::Is64Bit && Ty == IceType_i64) {
1928     // These x86-32 helper-call-involved instructions are lowered in this
1929     // separate switch. This is because loOperand() and hiOperand() may insert
1930     // redundant instructions for constant blinding and pooling. Such redundant
1931     // instructions will fail liveness analysis under -Om1 setting. And,
1932     // actually these arguments do not need to be processed with loOperand()
1933     // and hiOperand() to be used.
1934     switch (Instr->getOp()) {
1935     case InstArithmetic::Udiv:
1936     case InstArithmetic::Sdiv:
1937     case InstArithmetic::Urem:
1938     case InstArithmetic::Srem:
1939       llvm::report_fatal_error("Helper call was expected");
1940       return;
1941     default:
1942       break;
1943     }
1944
1945     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
1946     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
1947     Operand *Src0Lo = loOperand(Src0);
1948     Operand *Src0Hi = hiOperand(Src0);
1949     Operand *Src1Lo = loOperand(Src1);
1950     Operand *Src1Hi = hiOperand(Src1);
1951     Variable *T_Lo = nullptr, *T_Hi = nullptr;
1952     switch (Instr->getOp()) {
1953     case InstArithmetic::_num:
1954       llvm_unreachable("Unknown arithmetic operator");
1955       break;
1956     case InstArithmetic::Add:
1957       _mov(T_Lo, Src0Lo);
1958       _add(T_Lo, Src1Lo);
1959       _mov(DestLo, T_Lo);
1960       _mov(T_Hi, Src0Hi);
1961       _adc(T_Hi, Src1Hi);
1962       _mov(DestHi, T_Hi);
1963       break;
1964     case InstArithmetic::And:
1965       _mov(T_Lo, Src0Lo);
1966       _and(T_Lo, Src1Lo);
1967       _mov(DestLo, T_Lo);
1968       _mov(T_Hi, Src0Hi);
1969       _and(T_Hi, Src1Hi);
1970       _mov(DestHi, T_Hi);
1971       break;
1972     case InstArithmetic::Or:
1973       _mov(T_Lo, Src0Lo);
1974       _or(T_Lo, Src1Lo);
1975       _mov(DestLo, T_Lo);
1976       _mov(T_Hi, Src0Hi);
1977       _or(T_Hi, Src1Hi);
1978       _mov(DestHi, T_Hi);
1979       break;
1980     case InstArithmetic::Xor:
1981       _mov(T_Lo, Src0Lo);
1982       _xor(T_Lo, Src1Lo);
1983       _mov(DestLo, T_Lo);
1984       _mov(T_Hi, Src0Hi);
1985       _xor(T_Hi, Src1Hi);
1986       _mov(DestHi, T_Hi);
1987       break;
1988     case InstArithmetic::Sub:
1989       _mov(T_Lo, Src0Lo);
1990       _sub(T_Lo, Src1Lo);
1991       _mov(DestLo, T_Lo);
1992       _mov(T_Hi, Src0Hi);
1993       _sbb(T_Hi, Src1Hi);
1994       _mov(DestHi, T_Hi);
1995       break;
1996     case InstArithmetic::Mul: {
1997       Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
1998       Variable *T_4Lo = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
1999       Variable *T_4Hi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
2000       // gcc does the following:
2001       // a=b*c ==>
2002       //   t1 = b.hi; t1 *=(imul) c.lo
2003       //   t2 = c.hi; t2 *=(imul) b.lo
2004       //   t3:eax = b.lo
2005       //   t4.hi:edx,t4.lo:eax = t3:eax *(mul) c.lo
2006       //   a.lo = t4.lo
2007       //   t4.hi += t1
2008       //   t4.hi += t2
2009       //   a.hi = t4.hi
2010       // The mul instruction cannot take an immediate operand.
2011       Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Mem);
2012       _mov(T_1, Src0Hi);
2013       _imul(T_1, Src1Lo);
2014       _mov(T_3, Src0Lo, Traits::RegisterSet::Reg_eax);
2015       _mul(T_4Lo, T_3, Src1Lo);
2016       // The mul instruction produces two dest variables, edx:eax. We create a
2017       // fake definition of edx to account for this.
2018       Context.insert<InstFakeDef>(T_4Hi, T_4Lo);
2019       Context.insert<InstFakeUse>(T_4Hi);
2020       _mov(DestLo, T_4Lo);
2021       _add(T_4Hi, T_1);
2022       _mov(T_2, Src1Hi);
2023       _imul(T_2, Src0Lo);
2024       _add(T_4Hi, T_2);
2025       _mov(DestHi, T_4Hi);
2026     } break;
2027     case InstArithmetic::Shl:
2028     case InstArithmetic::Lshr:
2029     case InstArithmetic::Ashr:
2030       lowerShift64(Instr->getOp(), Src0Lo, Src0Hi, Src1Lo, DestLo, DestHi);
2031       break;
2032     case InstArithmetic::Fadd:
2033     case InstArithmetic::Fsub:
2034     case InstArithmetic::Fmul:
2035     case InstArithmetic::Fdiv:
2036     case InstArithmetic::Frem:
2037       llvm_unreachable("FP instruction with i64 type");
2038       break;
2039     case InstArithmetic::Udiv:
2040     case InstArithmetic::Sdiv:
2041     case InstArithmetic::Urem:
2042     case InstArithmetic::Srem:
2043       llvm_unreachable("Call-helper-involved instruction for i64 type \
2044                        should have already been handled before");
2045       break;
2046     }
2047     return;
2048   }
2049   if (isVectorType(Ty)) {
2050     // TODO: Trap on integer divide and integer modulo by zero. See:
2051     // https://code.google.com/p/nativeclient/issues/detail?id=3899
2052     if (llvm::isa<X86OperandMem>(Src1))
2053       Src1 = legalizeToReg(Src1);
2054     switch (Instr->getOp()) {
2055     case InstArithmetic::_num:
2056       llvm_unreachable("Unknown arithmetic operator");
2057       break;
2058     case InstArithmetic::Add: {
2059       Variable *T = makeReg(Ty);
2060       _movp(T, Src0);
2061       _padd(T, Src1);
2062       _movp(Dest, T);
2063     } break;
2064     case InstArithmetic::And: {
2065       Variable *T = makeReg(Ty);
2066       _movp(T, Src0);
2067       _pand(T, Src1);
2068       _movp(Dest, T);
2069     } break;
2070     case InstArithmetic::Or: {
2071       Variable *T = makeReg(Ty);
2072       _movp(T, Src0);
2073       _por(T, Src1);
2074       _movp(Dest, T);
2075     } break;
2076     case InstArithmetic::Xor: {
2077       Variable *T = makeReg(Ty);
2078       _movp(T, Src0);
2079       _pxor(T, Src1);
2080       _movp(Dest, T);
2081     } break;
2082     case InstArithmetic::Sub: {
2083       Variable *T = makeReg(Ty);
2084       _movp(T, Src0);
2085       _psub(T, Src1);
2086       _movp(Dest, T);
2087     } break;
2088     case InstArithmetic::Mul: {
2089       bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
2090       bool InstructionSetIsValidForPmull =
2091           Ty == IceType_v8i16 || InstructionSet >= Traits::SSE4_1;
2092       if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
2093         Variable *T = makeReg(Ty);
2094         _movp(T, Src0);
2095         _pmull(T, Src0 == Src1 ? T : Src1);
2096         _movp(Dest, T);
2097       } else if (Ty == IceType_v4i32) {
2098         // Lowering sequence:
2099         // Note: The mask arguments have index 0 on the left.
2100         //
2101         // movups  T1, Src0
2102         // pshufd  T2, Src0, {1,0,3,0}
2103         // pshufd  T3, Src1, {1,0,3,0}
2104         // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
2105         // pmuludq T1, Src1
2106         // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
2107         // pmuludq T2, T3
2108         // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
2109         // shufps  T1, T2, {0,2,0,2}
2110         // pshufd  T4, T1, {0,2,1,3}
2111         // movups  Dest, T4
2112
2113         // Mask that directs pshufd to create a vector with entries
2114         // Src[1, 0, 3, 0]
2115         constexpr unsigned Constant1030 = 0x31;
2116         Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
2117         // Mask that directs shufps to create a vector with entries
2118         // Dest[0, 2], Src[0, 2]
2119         constexpr unsigned Mask0202 = 0x88;
2120         // Mask that directs pshufd to create a vector with entries
2121         // Src[0, 2, 1, 3]
2122         constexpr unsigned Mask0213 = 0xd8;
2123         Variable *T1 = makeReg(IceType_v4i32);
2124         Variable *T2 = makeReg(IceType_v4i32);
2125         Variable *T3 = makeReg(IceType_v4i32);
2126         Variable *T4 = makeReg(IceType_v4i32);
2127         _movp(T1, Src0);
2128         _pshufd(T2, Src0, Mask1030);
2129         _pshufd(T3, Src1, Mask1030);
2130         _pmuludq(T1, Src1);
2131         _pmuludq(T2, T3);
2132         _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
2133         _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
2134         _movp(Dest, T4);
2135       } else if (Ty == IceType_v16i8) {
2136         llvm::report_fatal_error("Scalarized operation was expected");
2137       } else {
2138         llvm::report_fatal_error("Invalid vector multiply type");
2139       }
2140     } break;
2141     case InstArithmetic::Shl:
2142     case InstArithmetic::Lshr:
2143     case InstArithmetic::Ashr:
2144     case InstArithmetic::Udiv:
2145     case InstArithmetic::Urem:
2146     case InstArithmetic::Sdiv:
2147     case InstArithmetic::Srem:
2148       llvm::report_fatal_error("Scalarized operation was expected");
2149       break;
2150     case InstArithmetic::Fadd: {
2151       Variable *T = makeReg(Ty);
2152       _movp(T, Src0);
2153       _addps(T, Src1);
2154       _movp(Dest, T);
2155     } break;
2156     case InstArithmetic::Fsub: {
2157       Variable *T = makeReg(Ty);
2158       _movp(T, Src0);
2159       _subps(T, Src1);
2160       _movp(Dest, T);
2161     } break;
2162     case InstArithmetic::Fmul: {
2163       Variable *T = makeReg(Ty);
2164       _movp(T, Src0);
2165       _mulps(T, Src0 == Src1 ? T : Src1);
2166       _movp(Dest, T);
2167     } break;
2168     case InstArithmetic::Fdiv: {
2169       Variable *T = makeReg(Ty);
2170       _movp(T, Src0);
2171       _divps(T, Src1);
2172       _movp(Dest, T);
2173     } break;
2174     case InstArithmetic::Frem:
2175       llvm::report_fatal_error("Scalarized operation was expected");
2176       break;
2177     }
2178     return;
2179   }
2180   Variable *T_edx = nullptr;
2181   Variable *T = nullptr;
2182   switch (Instr->getOp()) {
2183   case InstArithmetic::_num:
2184     llvm_unreachable("Unknown arithmetic operator");
2185     break;
2186   case InstArithmetic::Add:
2187     _mov(T, Src0);
2188     _add(T, Src1);
2189     _mov(Dest, T);
2190     break;
2191   case InstArithmetic::And:
2192     _mov(T, Src0);
2193     _and(T, Src1);
2194     _mov(Dest, T);
2195     break;
2196   case InstArithmetic::Or:
2197     _mov(T, Src0);
2198     _or(T, Src1);
2199     _mov(Dest, T);
2200     break;
2201   case InstArithmetic::Xor:
2202     _mov(T, Src0);
2203     _xor(T, Src1);
2204     _mov(Dest, T);
2205     break;
2206   case InstArithmetic::Sub:
2207     _mov(T, Src0);
2208     _sub(T, Src1);
2209     _mov(Dest, T);
2210     break;
2211   case InstArithmetic::Mul:
2212     if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2213       if (optimizeScalarMul(Dest, Src0, C->getValue()))
2214         return;
2215     }
2216     // The 8-bit version of imul only allows the form "imul r/m8" where T must
2217     // be in al.
2218     if (isByteSizedArithType(Ty)) {
2219       _mov(T, Src0, Traits::RegisterSet::Reg_al);
2220       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2221       _imul(T, Src0 == Src1 ? T : Src1);
2222       _mov(Dest, T);
2223     } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2224       T = makeReg(Ty);
2225       _imul_imm(T, Src0, ImmConst);
2226       _mov(Dest, T);
2227     } else {
2228       _mov(T, Src0);
2229       _imul(T, Src0 == Src1 ? T : Src1);
2230       _mov(Dest, T);
2231     }
2232     break;
2233   case InstArithmetic::Shl:
2234     _mov(T, Src0);
2235     if (!llvm::isa<ConstantInteger32>(Src1) &&
2236         !llvm::isa<ConstantInteger64>(Src1))
2237       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2238     _shl(T, Src1);
2239     _mov(Dest, T);
2240     break;
2241   case InstArithmetic::Lshr:
2242     _mov(T, Src0);
2243     if (!llvm::isa<ConstantInteger32>(Src1) &&
2244         !llvm::isa<ConstantInteger64>(Src1))
2245       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2246     _shr(T, Src1);
2247     _mov(Dest, T);
2248     break;
2249   case InstArithmetic::Ashr:
2250     _mov(T, Src0);
2251     if (!llvm::isa<ConstantInteger32>(Src1) &&
2252         !llvm::isa<ConstantInteger64>(Src1))
2253       Src1 = copyToReg8(Src1, Traits::RegisterSet::Reg_cl);
2254     _sar(T, Src1);
2255     _mov(Dest, T);
2256     break;
2257   case InstArithmetic::Udiv: {
2258     // div and idiv are the few arithmetic operators that do not allow
2259     // immediates as the operand.
2260     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2261     RegNumT Eax;
2262     RegNumT Edx;
2263     switch (Ty) {
2264     default:
2265       llvm::report_fatal_error("Bad type for udiv");
2266     case IceType_i64:
2267       Eax = Traits::getRaxOrDie();
2268       Edx = Traits::getRdxOrDie();
2269       break;
2270     case IceType_i32:
2271       Eax = Traits::RegisterSet::Reg_eax;
2272       Edx = Traits::RegisterSet::Reg_edx;
2273       break;
2274     case IceType_i16:
2275       Eax = Traits::RegisterSet::Reg_ax;
2276       Edx = Traits::RegisterSet::Reg_dx;
2277       break;
2278     case IceType_i8:
2279       Eax = Traits::RegisterSet::Reg_al;
2280       Edx = Traits::RegisterSet::Reg_ah;
2281       break;
2282     }
2283     T_edx = makeReg(Ty, Edx);
2284     _mov(T, Src0, Eax);
2285     _mov(T_edx, Ctx->getConstantZero(Ty));
2286     _div(T, Src1, T_edx);
2287     _mov(Dest, T);
2288   } break;
2289   case InstArithmetic::Sdiv:
2290     // TODO(stichnot): Enable this after doing better performance and cross
2291     // testing.
2292     if (false && Func->getOptLevel() >= Opt_1) {
2293       // Optimize division by constant power of 2, but not for Om1 or O0, just
2294       // to keep things simple there.
2295       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2296         const int32_t Divisor = C->getValue();
2297         const uint32_t UDivisor = Divisor;
2298         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2299           uint32_t LogDiv = llvm::Log2_32(UDivisor);
2300           // LLVM does the following for dest=src/(1<<log):
2301           //   t=src
2302           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
2303           //   shr t,typewidth-log
2304           //   add t,src
2305           //   sar t,log
2306           //   dest=t
2307           uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
2308           _mov(T, Src0);
2309           // If for some reason we are dividing by 1, just treat it like an
2310           // assignment.
2311           if (LogDiv > 0) {
2312             // The initial sar is unnecessary when dividing by 2.
2313             if (LogDiv > 1)
2314               _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2315             _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2316             _add(T, Src0);
2317             _sar(T, Ctx->getConstantInt(Ty, LogDiv));
2318           }
2319           _mov(Dest, T);
2320           return;
2321         }
2322       }
2323     }
2324     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2325     switch (Ty) {
2326     default:
2327       llvm::report_fatal_error("Bad type for sdiv");
2328     case IceType_i64:
2329       T_edx = makeReg(Ty, Traits::getRdxOrDie());
2330       _mov(T, Src0, Traits::getRaxOrDie());
2331       break;
2332     case IceType_i32:
2333       T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
2334       _mov(T, Src0, Traits::RegisterSet::Reg_eax);
2335       break;
2336     case IceType_i16:
2337       T_edx = makeReg(Ty, Traits::RegisterSet::Reg_dx);
2338       _mov(T, Src0, Traits::RegisterSet::Reg_ax);
2339       break;
2340     case IceType_i8:
2341       T_edx = makeReg(IceType_i16, Traits::RegisterSet::Reg_ax);
2342       _mov(T, Src0, Traits::RegisterSet::Reg_al);
2343       break;
2344     }
2345     _cbwdq(T_edx, T);
2346     _idiv(T, Src1, T_edx);
2347     _mov(Dest, T);
2348     break;
2349   case InstArithmetic::Urem: {
2350     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2351     RegNumT Eax;
2352     RegNumT Edx;
2353     switch (Ty) {
2354     default:
2355       llvm::report_fatal_error("Bad type for urem");
2356     case IceType_i64:
2357       Eax = Traits::getRaxOrDie();
2358       Edx = Traits::getRdxOrDie();
2359       break;
2360     case IceType_i32:
2361       Eax = Traits::RegisterSet::Reg_eax;
2362       Edx = Traits::RegisterSet::Reg_edx;
2363       break;
2364     case IceType_i16:
2365       Eax = Traits::RegisterSet::Reg_ax;
2366       Edx = Traits::RegisterSet::Reg_dx;
2367       break;
2368     case IceType_i8:
2369       Eax = Traits::RegisterSet::Reg_al;
2370       Edx = Traits::RegisterSet::Reg_ah;
2371       break;
2372     }
2373     T_edx = makeReg(Ty, Edx);
2374     _mov(T_edx, Ctx->getConstantZero(Ty));
2375     _mov(T, Src0, Eax);
2376     _div(T_edx, Src1, T);
2377     if (Ty == IceType_i8) {
2378       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2379       // moved into a general 8-bit register.
2380       auto *T_AhRcvr = makeReg(Ty);
2381       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2382       _mov(T_AhRcvr, T_edx);
2383       T_edx = T_AhRcvr;
2384     }
2385     _mov(Dest, T_edx);
2386   } break;
2387   case InstArithmetic::Srem: {
2388     // TODO(stichnot): Enable this after doing better performance and cross
2389     // testing.
2390     if (false && Func->getOptLevel() >= Opt_1) {
2391       // Optimize mod by constant power of 2, but not for Om1 or O0, just to
2392       // keep things simple there.
2393       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2394         const int32_t Divisor = C->getValue();
2395         const uint32_t UDivisor = Divisor;
2396         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2397           uint32_t LogDiv = llvm::Log2_32(UDivisor);
2398           // LLVM does the following for dest=src%(1<<log):
2399           //   t=src
2400           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
2401           //   shr t,typewidth-log
2402           //   add t,src
2403           //   and t, -(1<<log)
2404           //   sub t,src
2405           //   neg t
2406           //   dest=t
2407           uint32_t TypeWidth = Traits::X86_CHAR_BIT * typeWidthInBytes(Ty);
2408           // If for some reason we are dividing by 1, just assign 0.
2409           if (LogDiv == 0) {
2410             _mov(Dest, Ctx->getConstantZero(Ty));
2411             return;
2412           }
2413           _mov(T, Src0);
2414           // The initial sar is unnecessary when dividing by 2.
2415           if (LogDiv > 1)
2416             _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2417           _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2418           _add(T, Src0);
2419           _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
2420           _sub(T, Src0);
2421           _neg(T);
2422           _mov(Dest, T);
2423           return;
2424         }
2425       }
2426     }
2427     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2428     RegNumT Eax;
2429     RegNumT Edx;
2430     switch (Ty) {
2431     default:
2432       llvm::report_fatal_error("Bad type for srem");
2433     case IceType_i64:
2434       Eax = Traits::getRaxOrDie();
2435       Edx = Traits::getRdxOrDie();
2436       break;
2437     case IceType_i32:
2438       Eax = Traits::RegisterSet::Reg_eax;
2439       Edx = Traits::RegisterSet::Reg_edx;
2440       break;
2441     case IceType_i16:
2442       Eax = Traits::RegisterSet::Reg_ax;
2443       Edx = Traits::RegisterSet::Reg_dx;
2444       break;
2445     case IceType_i8:
2446       Eax = Traits::RegisterSet::Reg_al;
2447       Edx = Traits::RegisterSet::Reg_ah;
2448       break;
2449     }
2450     T_edx = makeReg(Ty, Edx);
2451     _mov(T, Src0, Eax);
2452     _cbwdq(T_edx, T);
2453     _idiv(T_edx, Src1, T);
2454     if (Ty == IceType_i8) {
2455       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2456       // moved into a general 8-bit register.
2457       auto *T_AhRcvr = makeReg(Ty);
2458       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2459       _mov(T_AhRcvr, T_edx);
2460       T_edx = T_AhRcvr;
2461     }
2462     _mov(Dest, T_edx);
2463   } break;
2464   case InstArithmetic::Fadd:
2465     _mov(T, Src0);
2466     _addss(T, Src1);
2467     _mov(Dest, T);
2468     break;
2469   case InstArithmetic::Fsub:
2470     _mov(T, Src0);
2471     _subss(T, Src1);
2472     _mov(Dest, T);
2473     break;
2474   case InstArithmetic::Fmul:
2475     _mov(T, Src0);
2476     _mulss(T, Src0 == Src1 ? T : Src1);
2477     _mov(Dest, T);
2478     break;
2479   case InstArithmetic::Fdiv:
2480     _mov(T, Src0);
2481     _divss(T, Src1);
2482     _mov(Dest, T);
2483     break;
2484   case InstArithmetic::Frem:
2485     llvm::report_fatal_error("Helper call was expected");
2486     break;
2487   }
2488 }
2489
2490 template <typename TraitsType>
2491 void TargetX86Base<TraitsType>::lowerAssign(const InstAssign *Instr) {
2492   Variable *Dest = Instr->getDest();
2493   if (Dest->isRematerializable()) {
2494     Context.insert<InstFakeDef>(Dest);
2495     return;
2496   }
2497   Operand *Src = Instr->getSrc(0);
2498   assert(Dest->getType() == Src->getType());
2499   lowerMove(Dest, Src, false);
2500 }
2501
2502 template <typename TraitsType>
2503 void TargetX86Base<TraitsType>::lowerBr(const InstBr *Br) {
2504   if (Br->isUnconditional()) {
2505     _br(Br->getTargetUnconditional());
2506     return;
2507   }
2508   Operand *Cond = Br->getCondition();
2509
2510   // Handle folding opportunities.
2511   if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
2512     assert(Producer->isDeleted());
2513     switch (BoolFolding<Traits>::getProducerKind(Producer)) {
2514     default:
2515       break;
2516     case BoolFolding<Traits>::PK_Icmp32:
2517     case BoolFolding<Traits>::PK_Icmp64: {
2518       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
2519       return;
2520     }
2521     case BoolFolding<Traits>::PK_Fcmp: {
2522       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
2523       return;
2524     }
2525     case BoolFolding<Traits>::PK_Arith: {
2526       lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
2527       return;
2528     }
2529     }
2530   }
2531   Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
2532   Constant *Zero = Ctx->getConstantZero(IceType_i32);
2533   _cmp(Src0, Zero);
2534   _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
2535 }
2536
2537 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
2538 // OperandList in lowerCall. std::max() is supposed to work, but it doesn't.
2539 inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
2540   return S0 < S1 ? S1 : S0;
2541 }
2542
/// Lowers a call instruction using the common x86 calling convention.
///
/// The arguments are first classified as XMM-register, GPR-register, or stack
/// arguments. Stack arguments are stored into the outgoing parameter area,
/// register arguments are legalized into their assigned physical registers,
/// and the call itself is emitted through emitCallToTarget() followed by a
/// FakeKill. If the call has a Dest, the result is finally copied into it,
/// either from the return register(s) or, for floating point results that are
/// not returned in xmm registers, with fstp.
2543 template <typename TraitsType>
2544 void TargetX86Base<TraitsType>::lowerCall(const InstCall *Instr) {
2545   // Common x86 calling convention lowering:
2546   //
2547   // * At the point before the call, the stack must be aligned to 16 bytes.
2548   //
2549   // * Non-register arguments are pushed onto the stack in right-to-left order,
2550   // such that the left-most argument ends up on the top of the stack at the
2551   // lowest memory address.
2552   //
2553   // * Stack arguments of vector type are aligned to start at the next highest
2554   // multiple of 16 bytes. Other stack arguments are aligned to the next word
2555   // size boundary (4 or 8 bytes, respectively).
2556   NeedsStackAlignment = true;
2557
2558   using OperandList =
2559       llvm::SmallVector<Operand *, constexprMax(Traits::X86_MAX_XMM_ARGS,
2560                                                 Traits::X86_MAX_GPR_ARGS)>;
2561   OperandList XmmArgs;
2562   CfgVector<std::pair<const Type, Operand *>> GprArgs;
2563   OperandList StackArgs, StackArgLocations;
       // Running offset of the next stack argument within the parameter area;
       // after the loop below it holds the (unaligned) size of that area.
2564   uint32_t ParameterAreaSizeBytes = 0;
2565
2566   // Classify each argument operand according to the location where the argument
2567   // is passed.
2568   for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
2569     Operand *Arg = Instr->getArg(i);
2570     const Type Ty = Arg->getType();
2571     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
2572     assert(typeWidthInBytes(Ty) >= 4);
2573     if (isVectorType(Ty) &&
2574         Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) {
2575       XmmArgs.push_back(Arg);
2576     } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
2577                Traits::getRegisterForXmmArgNum(XmmArgs.size()).hasValue()) {
2578       XmmArgs.push_back(Arg);
2579     } else if (isScalarIntegerType(Ty) &&
2580                Traits::getRegisterForGprArgNum(Ty, GprArgs.size()).hasValue()) {
2581       GprArgs.emplace_back(Ty, Arg);
2582     } else {
2583       // Place on stack.
2584       StackArgs.push_back(Arg);
2585       if (isVectorType(Arg->getType())) {
2586         ParameterAreaSizeBytes =
2587             Traits::applyStackAlignment(ParameterAreaSizeBytes);
2588       }
           // The argument's slot is addressed as [stack register + current
           // offset]; the offset then advances by the argument's on-stack size.
2589       Variable *esp = getPhysicalRegister(getStackReg(), Traits::WordType);
2590       Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
2591       StackArgLocations.push_back(
2592           Traits::X86OperandMem::create(Func, Ty, esp, Loc));
2593       ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
2594     }
2595   }
2596   // Ensure there is enough space for the fstp/movs for floating returns.
2597   Variable *Dest = Instr->getDest();
2598   const Type DestTy = Dest ? Dest->getType() : IceType_void;
2599   if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2600     if (isScalarFloatingType(DestTy)) {
2601       ParameterAreaSizeBytes =
2602           std::max(static_cast<size_t>(ParameterAreaSizeBytes),
2603                    typeWidthInBytesOnStack(DestTy));
2604     }
2605   }
2606   // Adjust the parameter area so that the stack is aligned. It is assumed that
2607   // the stack is already aligned at the start of the calling sequence.
2608   ParameterAreaSizeBytes = Traits::applyStackAlignment(ParameterAreaSizeBytes);
2609   assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
2610   // Copy arguments that are passed on the stack to the appropriate stack
2611   // locations.  We make sure legalize() is called on each argument at this
2612   // point, to allow availabilityGet() to work.
2613   for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
2614     lowerStore(
2615         InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
2616   }
2617   // Copy arguments to be passed in registers to the appropriate registers.
2618   for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
2619     XmmArgs[i] =
2620         legalizeToReg(legalize(XmmArgs[i]), Traits::getRegisterForXmmArgNum(i));
2621   }
2622   // Materialize moves for arguments passed in GPRs.
2623   for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
2624     const Type SignatureTy = GprArgs[i].first;
2625     Operand *Arg =
2626         legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
2627     GprArgs[i].second =
2628         legalizeToReg(Arg, Traits::getRegisterForGprArgNum(Arg->getType(), i));
2629     assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
2630     assert(SignatureTy == Arg->getType());
         // SignatureTy is only read by the asserts above; the cast keeps it
         // "used" when asserts are compiled out.
2631     (void)SignatureTy;
2632   }
2633   // Generate a FakeUse of register arguments so that they do not get dead code
2634   // eliminated as a result of the FakeKill of scratch registers after the call.
2635   // These need to be right before the call instruction.
2636   for (auto *Arg : XmmArgs) {
2637     Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
2638   }
2639   for (auto &ArgPair : GprArgs) {
2640     Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
2641   }
2642   // Generate the call instruction. Assign its result to a temporary with high
2643   // register allocation weight.
2644   // ReturnReg doubles as ReturnRegLo as necessary.
2645   Variable *ReturnReg = nullptr;
2646   Variable *ReturnRegHi = nullptr;
2647   if (Dest) {
2648     switch (DestTy) {
2649     case IceType_NUM:
2650     case IceType_void:
2651     case IceType_i1:
2652     case IceType_i8:
2653     case IceType_i16:
2654       llvm::report_fatal_error("Invalid Call dest type");
2655       break;
2656     case IceType_i32:
2657       ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_eax);
2658       break;
2659     case IceType_i64:
2660       if (Traits::Is64Bit) {
2661         ReturnReg = makeReg(IceType_i64, Traits::getRaxOrDie());
2662       } else {
2663         ReturnReg = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
2664         ReturnRegHi = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
2665       }
2666       break;
2667     case IceType_f32:
2668     case IceType_f64:
2669       if (!Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2670         // Leave ReturnReg==ReturnRegHi==nullptr, and capture the result with
2671         // the fstp instruction.
2672         break;
2673       }
2674     // Fallthrough intended.
2675     case IceType_v4i1:
2676     case IceType_v8i1:
2677     case IceType_v16i1:
2678     case IceType_v16i8:
2679     case IceType_v8i16:
2680     case IceType_v4i32:
2681     case IceType_v4f32:
2682       ReturnReg = makeReg(DestTy, Traits::RegisterSet::Reg_xmm0);
2683       break;
2684     }
2685   }
2686   // Emit the call to the function.
2687   Operand *CallTarget =
2688       legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
2689   Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg);
2690   // Keep the upper return register live on 32-bit platform.
2691   if (ReturnRegHi)
2692     Context.insert<InstFakeDef>(ReturnRegHi);
2693   // Mark the call as killing all the caller-save registers.
2694   Context.insert<InstFakeKill>(NewCall);
2695   // Handle x86-32 floating point returns.
2696   if (Dest != nullptr && isScalarFloatingType(DestTy) &&
2697       !Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2698     // Special treatment for an FP function which returns its result in st(0).
2699     // If Dest ends up being a physical xmm register, the fstp emit code will
2700     // route st(0) through the space reserved in the function argument area
2701     // we allocated.
2702     _fstp(Dest);
2703     // Create a fake use of Dest in case it actually isn't used, because st(0)
2704     // still needs to be popped.
2705     Context.insert<InstFakeUse>(Dest);
2706   }
2707   // Generate a FakeUse to keep the call live if necessary.
2708   if (Instr->hasSideEffects() && ReturnReg) {
2709     Context.insert<InstFakeUse>(ReturnReg);
2710   }
2711   // Process the return value, if any.
2712   if (Dest == nullptr)
2713     return;
2714   // Assign the result of the call to Dest.  Route it through a temporary so
2715   // that the local register availability peephole can be subsequently used.
       // NOTE(review): Tmp (and TmpHi below) are passed to _mov while still
       // null; presumably _mov creates the temporary in that case -- confirm
       // against the _mov helper.
2716   Variable *Tmp = nullptr;
2717   if (isVectorType(DestTy)) {
2718     assert(ReturnReg && "Vector type requires a return register");
2719     Tmp = makeReg(DestTy);
2720     _movp(Tmp, ReturnReg);
2721     _movp(Dest, Tmp);
2722   } else if (isScalarFloatingType(DestTy)) {
         // When scalar FP is not returned in xmm, the fstp above has already
         // written Dest, so nothing is emitted here.
2723     if (Traits::X86_PASS_SCALAR_FP_IN_XMM) {
2724       assert(ReturnReg && "FP type requires a return register");
2725       _mov(Tmp, ReturnReg);
2726       _mov(Dest, Tmp);
2727     }
2728   } else {
2729     assert(isScalarIntegerType(DestTy));
2730     assert(ReturnReg && "Integer type requires a return register");
2731     if (DestTy == IceType_i64 && !Traits::Is64Bit) {
           // 64-bit result on a 32-bit target: copy the low and high halves
           // separately from ReturnReg and ReturnRegHi.
2732       assert(ReturnRegHi && "64-bit type requires two return registers");
2733       auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
2734       Variable *DestLo = Dest64On32->getLo();
2735       Variable *DestHi = Dest64On32->getHi();
2736       _mov(Tmp, ReturnReg);
2737       _mov(DestLo, Tmp);
2738       Variable *TmpHi = nullptr;
2739       _mov(TmpHi, ReturnRegHi);
2740       _mov(DestHi, TmpHi);
2741     } else {
2742       _mov(Tmp, ReturnReg);
2743       _mov(Dest, Tmp);
2744     }
2745   }
2746 }
2747
/// Lowers a cast instruction, dispatching on the cast kind.
///
/// Sext, Zext, Trunc, Fptrunc/Fpext, Fptosi, Fptoui, Sitofp, Uitofp and
/// Bitcast are handled here; any other kind sets the "Cast type not supported"
/// error on the function. Each handled kind computes the result into a
/// temporary and then moves it into Dest. A number of type combinations are
/// not lowered inline and instead raise the fatal error "Helper call was
/// expected".
2748 template <typename TraitsType>
2749 void TargetX86Base<TraitsType>::lowerCast(const InstCast *Instr) {
2750   // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
2751   InstCast::OpKind CastKind = Instr->getCastKind();
2752   Variable *Dest = Instr->getDest();
2753   Type DestTy = Dest->getType();
2754   switch (CastKind) {
2755   default:
2756     Func->setError("Cast type not supported");
2757     return;
2758   case InstCast::Sext: {
2759     // Src0RM is the source operand legalized to physical register or memory,
2760     // but not immediate, since the relevant x86 native instructions don't
2761     // allow an immediate operand. If the operand is an immediate, we could
2762     // consider computing the strength-reduced result at translation time, but
2763     // we're unlikely to see something like that in the bitcode that the
2764     // optimizer wouldn't have already taken care of.
2765     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2766     if (isVectorType(DestTy)) {
2767       if (DestTy == IceType_v16i8) {
2768         // onemask = materialize(1,1,...); dst = (src & onemask) > 0
2769         Variable *OneMask = makeVectorOfOnes(DestTy);
2770         Variable *T = makeReg(DestTy);
2771         _movp(T, Src0RM);
2772         _pand(T, OneMask);
2773         Variable *Zeros = makeVectorOfZeros(DestTy);
2774         _pcmpgt(T, Zeros);
2775         _movp(Dest, T);
2776       } else {
2777         // width = width(elty) - 1; dest = (src << width) >> width
2778         SizeT ShiftAmount =
2779             Traits::X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) -
2780             1;
2781         Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
2782         Variable *T = makeReg(DestTy);
2783         _movp(T, Src0RM);
2784         _psll(T, ShiftConstant);
2785         _psra(T, ShiftConstant);
2786         _movp(Dest, T);
2787       }
2788     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
2789       // t1=movsx src; t2=t1; t2=sar t2, 31; dst.lo=t1; dst.hi=t2
2790       Constant *Shift = Ctx->getConstantInt32(31);
2791       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2792       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2793       Variable *T_Lo = makeReg(DestLo->getType());
2794       if (Src0RM->getType() == IceType_i32) {
2795         _mov(T_Lo, Src0RM);
2796       } else if (Src0RM->getType() == IceType_i1) {
             // An i1 source is sign-extended by shifting its bit up to bit 31
             // and arithmetically shifting it back down.
2797         _movzx(T_Lo, Src0RM);
2798         _shl(T_Lo, Shift);
2799         _sar(T_Lo, Shift);
2800       } else {
2801         _movsx(T_Lo, Src0RM);
2802       }
2803       _mov(DestLo, T_Lo);
2804       Variable *T_Hi = nullptr;
2805       _mov(T_Hi, T_Lo);
2806       if (Src0RM->getType() != IceType_i1)
2807         // For i1, the sar instruction is already done above.
2808         _sar(T_Hi, Shift);
2809       _mov(DestHi, T_Hi);
2810     } else if (Src0RM->getType() == IceType_i1) {
2811       // t1 = src
2812       // shl t1, dst_bitwidth - 1
2813       // sar t1, dst_bitwidth - 1
2814       // dst = t1
2815       size_t DestBits = Traits::X86_CHAR_BIT * typeWidthInBytes(DestTy);
2816       Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
2817       Variable *T = makeReg(DestTy);
2818       if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
2819         _mov(T, Src0RM);
2820       } else {
2821         // Widen the source using movsx or movzx. (It doesn't matter which one,
2822         // since the following shl/sar overwrite the bits.)
2823         _movzx(T, Src0RM);
2824       }
2825       _shl(T, ShiftAmount);
2826       _sar(T, ShiftAmount);
2827       _mov(Dest, T);
2828     } else {
2829       // t1 = movsx src; dst = t1
2830       Variable *T = makeReg(DestTy);
2831       _movsx(T, Src0RM);
2832       _mov(Dest, T);
2833     }
2834     break;
2835   }
2836   case InstCast::Zext: {
2837     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2838     if (isVectorType(DestTy)) {
2839       // onemask = materialize(1,1,...); dest = onemask & src
2840       Variable *OneMask = makeVectorOfOnes(DestTy);
2841       Variable *T = makeReg(DestTy);
2842       _movp(T, Src0RM);
2843       _pand(T, OneMask);
2844       _movp(Dest, T);
2845     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
2846       // t1=movzx src; dst.lo=t1; dst.hi=0
2847       Constant *Zero = Ctx->getConstantZero(IceType_i32);
2848       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
2849       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
2850       Variable *Tmp = makeReg(DestLo->getType());
2851       if (Src0RM->getType() == IceType_i32) {
2852         _mov(Tmp, Src0RM);
2853       } else {
2854         _movzx(Tmp, Src0RM);
2855       }
2856       _mov(DestLo, Tmp);
2857       _mov(DestHi, Zero);
2858     } else if (Src0RM->getType() == IceType_i1) {
2859       // t = Src0RM; Dest = t
2860       Variable *T = nullptr;
2861       if (DestTy == IceType_i8) {
2862         _mov(T, Src0RM);
2863       } else {
2864         assert(DestTy != IceType_i1);
2865         assert(Traits::Is64Bit || DestTy != IceType_i64);
2866         // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
2867         // In x86-64 we need to widen T to 64-bits to ensure that T -- if
2868         // written to the stack (i.e., in -Om1) will be fully zero-extended.
2869         T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
2870         _movzx(T, Src0RM);
2871       }
2872       _mov(Dest, T);
2873     } else {
2874       // t1 = movzx src; dst = t1
2875       Variable *T = makeReg(DestTy);
2876       _movzx(T, Src0RM);
2877       _mov(Dest, T);
2878     }
2879     break;
2880   }
2881   case InstCast::Trunc: {
2882     if (isVectorType(DestTy)) {
2883       // onemask = materialize(1,1,...); dst = src & onemask
2884       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2885       Type Src0Ty = Src0RM->getType();
2886       Variable *OneMask = makeVectorOfOnes(Src0Ty);
2887       Variable *T = makeReg(DestTy);
2888       _movp(T, Src0RM);
2889       _pand(T, OneMask);
2890       _movp(Dest, T);
2891     } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
2892       // Make sure we truncate from and into valid registers.
2893       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
2894       if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
2895         Src0 = loOperand(Src0);
2896       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2897       Variable *T = copyToReg8(Src0RM);
           // For an i1 destination only the lowest bit is kept.
2898       if (DestTy == IceType_i1)
2899         _and(T, Ctx->getConstantInt1(1));
2900       _mov(Dest, T);
2901     } else {
2902       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
           // On a 32-bit target only the low half of an i64 source is needed.
2903       if (!Traits::Is64Bit && Src0->getType() == IceType_i64)
2904         Src0 = loOperand(Src0);
2905       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2906       // t1 = trunc Src0RM; Dest = t1
2907       Variable *T = makeReg(DestTy);
2908       _mov(T, Src0RM);
2909       _mov(Dest, T);
2910     }
2911     break;
2912   }
2913   case InstCast::Fptrunc:
2914   case InstCast::Fpext: {
2915     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2916     // t1 = cvt Src0RM; Dest = t1
2917     Variable *T = makeReg(DestTy);
2918     _cvt(T, Src0RM, Traits::Insts::Cvt::Float2float);
2919     _mov(Dest, T);
2920     break;
2921   }
2922   case InstCast::Fptosi:
2923     if (isVectorType(DestTy)) {
2924       assert(DestTy == IceType_v4i32 &&
2925              Instr->getSrc(0)->getType() == IceType_v4f32);
2926       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
           // A memory source is first loaded into a register.
2927       if (llvm::isa<X86OperandMem>(Src0RM))
2928         Src0RM = legalizeToReg(Src0RM);
2929       Variable *T = makeReg(DestTy);
2930       _cvt(T, Src0RM, Traits::Insts::Cvt::Tps2dq);
2931       _movp(Dest, T);
2932     } else if (!Traits::Is64Bit && DestTy == IceType_i64) {
2933       llvm::report_fatal_error("Helper call was expected");
2934     } else {
2935       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2936       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
           // (t1 is i64 rather than i32 when targeting x86-64 with an i64 Dest.)
2937       Variable *T_1 = nullptr;
2938       if (Traits::Is64Bit && DestTy == IceType_i64) {
2939         T_1 = makeReg(IceType_i64);
2940       } else {
2941         assert(DestTy != IceType_i64);
2942         T_1 = makeReg(IceType_i32);
2943       }
2944       // cvt() requires its integer argument to be a GPR.
2945       Variable *T_2 = makeReg(DestTy);
2946       if (isByteSizedType(DestTy)) {
2947         assert(T_1->getType() == IceType_i32);
2948         T_1->setRegClass(RCX86_Is32To8);
2949         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
2950       }
2951       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
2952       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
2953       if (DestTy == IceType_i1)
2954         _and(T_2, Ctx->getConstantInt1(1));
2955       _mov(Dest, T_2);
2956     }
2957     break;
2958   case InstCast::Fptoui:
2959     if (isVectorType(DestTy)) {
2960       llvm::report_fatal_error("Helper call was expected");
2961     } else if (DestTy == IceType_i64 ||
2962                (!Traits::Is64Bit && DestTy == IceType_i32)) {
2963       llvm::report_fatal_error("Helper call was expected");
2964     } else {
2965       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2966       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
           // NOTE(review): on x86-64 with an i32 Dest, t1 is i64 (see below),
           // presumably so that the signed conversion can represent every
           // unsigned 32-bit result -- confirm.
2967       assert(DestTy != IceType_i64);
2968       Variable *T_1 = nullptr;
2969       if (Traits::Is64Bit && DestTy == IceType_i32) {
2970         T_1 = makeReg(IceType_i64);
2971       } else {
2972         assert(DestTy != IceType_i32);
2973         T_1 = makeReg(IceType_i32);
2974       }
2975       Variable *T_2 = makeReg(DestTy);
2976       if (isByteSizedType(DestTy)) {
2977         assert(T_1->getType() == IceType_i32);
2978         T_1->setRegClass(RCX86_Is32To8);
2979         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
2980       }
2981       _cvt(T_1, Src0RM, Traits::Insts::Cvt::Tss2si);
2982       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
2983       if (DestTy == IceType_i1)
2984         _and(T_2, Ctx->getConstantInt1(1));
2985       _mov(Dest, T_2);
2986     }
2987     break;
2988   case InstCast::Sitofp:
2989     if (isVectorType(DestTy)) {
2990       assert(DestTy == IceType_v4f32 &&
2991              Instr->getSrc(0)->getType() == IceType_v4i32);
2992       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
           // A memory source is first loaded into a register.
2993       if (llvm::isa<X86OperandMem>(Src0RM))
2994         Src0RM = legalizeToReg(Src0RM);
2995       Variable *T = makeReg(DestTy);
2996       _cvt(T, Src0RM, Traits::Insts::Cvt::Dq2ps);
2997       _movp(Dest, T);
2998     } else if (!Traits::Is64Bit && Instr->getSrc(0)->getType() == IceType_i64) {
2999       llvm::report_fatal_error("Helper call was expected");
3000     } else {
3001       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
3002       // Sign-extend the operand.
3003       // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
3004       Variable *T_1 = nullptr;
3005       if (Traits::Is64Bit && Src0RM->getType() == IceType_i64) {
3006         T_1 = makeReg(IceType_i64);
3007       } else {
3008         assert(Src0RM->getType() != IceType_i64);
3009         T_1 = makeReg(IceType_i32);
3010       }
3011       Variable *T_2 = makeReg(DestTy);
           // A plain move suffices when the source already has T_1's type.
3012       if (Src0RM->getType() == T_1->getType())
3013         _mov(T_1, Src0RM);
3014       else
3015         _movsx(T_1, Src0RM);
3016       _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
3017       _mov(Dest, T_2);
3018     }
3019     break;
3020   case InstCast::Uitofp: {
3021     Operand *Src0 = Instr->getSrc(0);
3022     if (isVectorType(Src0->getType())) {
3023       llvm::report_fatal_error("Helper call was expected");
3024     } else if (Src0->getType() == IceType_i64 ||
3025                (!Traits::Is64Bit && Src0->getType() == IceType_i32)) {
3026       llvm::report_fatal_error("Helper call was expected");
3027     } else {
3028       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3029       // Zero-extend the operand.
3030       // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
           // (t1 is i64 rather than i32 when targeting x86-64 with an i32
           // source, see below.)
3031       Variable *T_1 = nullptr;
3032       if (Traits::Is64Bit && Src0RM->getType() == IceType_i32) {
3033         T_1 = makeReg(IceType_i64);
3034       } else {
3035         assert(Src0RM->getType() != IceType_i64);
3036         assert(Traits::Is64Bit || Src0RM->getType() != IceType_i32);
3037         T_1 = makeReg(IceType_i32);
3038       }
3039       Variable *T_2 = makeReg(DestTy);
3040       if (Src0RM->getType() == T_1->getType())
3041         _mov(T_1, Src0RM);
3042       else
3043         _movzx(T_1, Src0RM);
3044       _cvt(T_2, T_1, Traits::Insts::Cvt::Si2ss);
3045       _mov(Dest, T_2);
3046     }
3047     break;
3048   }
3049   case InstCast::Bitcast: {
3050     Operand *Src0 = Instr->getSrc(0);
         // A bitcast between identical types is lowered as a plain assignment.
3051     if (DestTy == Src0->getType()) {
3052       auto *Assign = InstAssign::create(Func, Dest, Src0);
3053       lowerAssign(Assign);
3054       return;
3055     }
3056     switch (DestTy) {
3057     default:
3058       llvm_unreachable("Unexpected Bitcast dest type");
3059     case IceType_i8: {
3060       llvm::report_fatal_error("Helper call was expected");
3061     } break;
3062     case IceType_i16: {
3063       llvm::report_fatal_error("Helper call was expected");
3064     } break;
3065     case IceType_i32:
3066     case IceType_f32: {
3067       Variable *Src0R = legalizeToReg(Src0);
3068       Variable *T = makeReg(DestTy);
3069       _movd(T, Src0R);
3070       _mov(Dest, T);
3071     } break;
3072     case IceType_i64: {
3073       assert(Src0->getType() == IceType_f64);
3074       if (Traits::Is64Bit) {
3075         Variable *Src0R = legalizeToReg(Src0);
3076         Variable *T = makeReg(IceType_i64);
3077         _movd(T, Src0R);
3078         _mov(Dest, T);
3079       } else {
3080         Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3081         // a.i64 = bitcast b.f64 ==>
3082         //   s.f64 = spill b.f64
3083         //   t_lo.i32 = lo(s.f64)
3084         //   a_lo.i32 = t_lo.i32
3085         //   t_hi.i32 = hi(s.f64)
3086         //   a_hi.i32 = t_hi.i32
3087         Operand *SpillLo, *SpillHi;
3088         if (auto *Src0Var = llvm::dyn_cast<Variable>(Src0RM)) {
               // Variable source: copy it into a spill variable that must not
               // get a register, then read that variable's two halves.
3089           Variable *Spill = Func->makeVariable(IceType_f64);
3090           Spill->setLinkedTo(Src0Var);
3091           Spill->setMustNotHaveReg();
3092           _movq(Spill, Src0RM);
3093           SpillLo = Traits::VariableSplit::create(Func, Spill,
3094                                                   Traits::VariableSplit::Low);
3095           SpillHi = Traits::VariableSplit::create(Func, Spill,
3096                                                   Traits::VariableSplit::High);
3097         } else {
               // Non-variable source: take its low and high operands directly.
3098           SpillLo = loOperand(Src0RM);
3099           SpillHi = hiOperand(Src0RM);
3100         }
3101
3102         auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
3103         auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
3104         Variable *T_Lo = makeReg(IceType_i32);
3105         Variable *T_Hi = makeReg(IceType_i32);
3106
3107         _mov(T_Lo, SpillLo);
3108         _mov(DestLo, T_Lo);
3109         _mov(T_Hi, SpillHi);
3110         _mov(DestHi, T_Hi);
3111       }
3112     } break;
3113     case IceType_f64: {
3114       assert(Src0->getType() == IceType_i64);
3115       if (Traits::Is64Bit) {
3116         Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3117         Variable *T = makeReg(IceType_f64);
3118         _movd(T, Src0RM);
3119         _mov(Dest, T);
3120       } else {
3121         Src0 = legalize(Src0);
3122         if (llvm::isa<X86OperandMem>(Src0)) {
               // A memory source is moved with movq through a temporary. The
               // break leaves the DestTy switch; the break that follows that
               // switch then ends the Bitcast case.
3123           Variable *T = Func->makeVariable(DestTy);
3124           _movq(T, Src0);
3125           _movq(Dest, T);
3126           break;
3127         }
3128         // a.f64 = bitcast b.i64 ==>
3129         //   t_lo.i32 = b_lo.i32
3130         //   FakeDef(s.f64)
3131         //   lo(s.f64) = t_lo.i32
3132         //   t_hi.i32 = b_hi.i32
3133         //   hi(s.f64) = t_hi.i32
3134         //   a.f64 = s.f64
3135         Variable *Spill = Func->makeVariable(IceType_f64);
3136         Spill->setLinkedTo(Dest);
3137         Spill->setMustNotHaveReg();
3138
3139         Variable *T_Lo = nullptr, *T_Hi = nullptr;
3140         auto *SpillLo = Traits::VariableSplit::create(
3141             Func, Spill, Traits::VariableSplit::Low);
3142         auto *SpillHi = Traits::VariableSplit::create(
3143             Func, Spill, Traits::VariableSplit::High);
3144         _mov(T_Lo, loOperand(Src0));
3145         // Technically, the Spill is defined after the _store happens, but
3146         // SpillLo is considered a "use" of Spill so define Spill before it is
3147         // used.
3148         Context.insert<InstFakeDef>(Spill);
3149         _store(T_Lo, SpillLo);
3150         _mov(T_Hi, hiOperand(Src0));
3151         _store(T_Hi, SpillHi);
3152         _movq(Dest, Spill);
3153       }
3154     } break;
3155     case IceType_v8i1: {
3156       llvm::report_fatal_error("Helper call was expected");
3157     } break;
3158     case IceType_v16i1: {
3159       llvm::report_fatal_error("Helper call was expected");
3160     } break;
3161     case IceType_v8i16:
3162     case IceType_v16i8:
3163     case IceType_v4i32:
3164     case IceType_v4f32: {
3165       _movp(Dest, legalizeToReg(Src0));
3166     } break;
3167     }
3168     break;
3169   }
3170   }
3171 }
3172
3173 template <typename TraitsType>
3174 void TargetX86Base<TraitsType>::lowerExtractElement(
3175     const InstExtractElement *Instr) {
3176   Operand *SourceVectNotLegalized = Instr->getSrc(0);
3177   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
3178   // Only constant indices are allowed in PNaCl IR.
3179   assert(ElementIndex);
3180
3181   unsigned Index = ElementIndex->getValue();
3182   Type Ty = SourceVectNotLegalized->getType();
3183   Type ElementTy = typeElementType(Ty);
3184   Type InVectorElementTy = Traits::getInVectorElementType(Ty);
3185
3186   // TODO(wala): Determine the best lowering sequences for each type.
3187   bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
3188                      (InstructionSet >= Traits::SSE4_1 && Ty != IceType_v4f32);
3189   Variable *ExtractedElementR =
3190       makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
3191   if (CanUsePextr) {
3192     // Use pextrb, pextrw, or pextrd.  The "b" and "w" versions clear the upper
3193     // bits of the destination register, so we represent this by always
3194     // extracting into an i32 register.  The _mov into Dest below will do
3195     // truncation as necessary.
3196     Constant *Mask = Ctx->getConstantInt32(Index);
3197     Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
3198     _pextr(ExtractedElementR, SourceVectR, Mask);
3199   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
3200     // Use pshufd and movd/movss.
3201     Variable *T = nullptr;
3202     if (Index) {
3203       // The shuffle only needs to occur if the element to be extracted is not
3204       // at the lowest index.
3205       Constant *Mask = Ctx->getConstantInt32(Index);
3206       T = makeReg(Ty);
3207       _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
3208     } else {
3209       T = legalizeToReg(SourceVectNotLegalized);
3210     }
3211
3212     if (InVectorElementTy == IceType_i32) {
3213       _movd(ExtractedElementR, T);
3214     } else { // Ty == IceType_f32
3215       // TODO(wala): _movss is only used here because _mov does not allow a
3216       // vector source and a scalar destination.  _mov should be able to be
3217       // used here.
3218       // _movss is a binary instruction, so the FakeDef is needed to keep the
3219       // live range analysis consistent.
3220       Context.insert<InstFakeDef>(ExtractedElementR);
3221       _movss(ExtractedElementR, T);
3222     }
3223   } else {
3224     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
3225     // Spill the value to a stack slot and do the extraction in memory.
3226     //
3227     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
3228     // for legalizing to mem is implemented.
3229     Variable *Slot = Func->makeVariable(Ty);
3230     Slot->setMustNotHaveReg();
3231     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
3232
3233     // Compute the location of the element in memory.
3234     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
3235     X86OperandMem *Loc =
3236         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
3237     _mov(ExtractedElementR, Loc);
3238   }
3239
3240   if (ElementTy == IceType_i1) {
3241     // Truncate extracted integers to i1s if necessary.
3242     Variable *T = makeReg(IceType_i1);
3243     InstCast *Cast =
3244         InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
3245     lowerCast(Cast);
3246     ExtractedElementR = T;
3247   }
3248
3249   // Copy the element to the destination.
3250   Variable *Dest = Instr->getDest();
3251   _mov(Dest, ExtractedElementR);
3252 }
3253
3254 template <typename TraitsType>
3255 void TargetX86Base<TraitsType>::lowerFcmp(const InstFcmp *Fcmp) {
3256   Variable *Dest = Fcmp->getDest();
3257
3258   if (isVectorType(Dest->getType())) {
3259     lowerFcmpVector(Fcmp);
3260   } else {
3261     constexpr Inst *Consumer = nullptr;
3262     lowerFcmpAndConsumer(Fcmp, Consumer);
3263   }
3264 }
3265
/// Lowers a scalar fcmp, optionally fused with the instruction that consumes
/// its result.
///
/// \param Fcmp the compare to lower.  A vector destination is a fatal error.
/// \param Consumer nullptr to materialize the boolean result into the fcmp's
///        destination, or an InstBr / InstSelect that the compare is folded
///        into.  Any other consumer kind is a fatal error.
///
/// The lowering is driven by Traits::TableFcmp[Condition]: whether to swap the
/// operands, up to two branch conditions (C1, C2), and the Default result that
/// holds when one of those branches is taken.
3266 template <typename TraitsType>
3267 void TargetX86Base<TraitsType>::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
3268                                                      const Inst *Consumer) {
3269   Operand *Src0 = Fcmp->getSrc(0);
3270   Operand *Src1 = Fcmp->getSrc(1);
3271   Variable *Dest = Fcmp->getDest();
3272
3273   if (isVectorType(Dest->getType()))
3274     llvm::report_fatal_error("Vector compare/branch cannot be folded");
3275
       // Give the fcmp+select special case a chance first; if it succeeds, it
       // has already emitted everything that is needed.
3276   if (Consumer != nullptr) {
3277     if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3278       if (lowerOptimizeFcmpSelect(Fcmp, Select))
3279         return;
3280     }
3281   }
3282
3283   // Lowering a = fcmp cond, b, c
3284   //   ucomiss b, c       /* only if C1 != Br_None */
3285   //                      /* but swap b,c order if SwapOperands==true */
3286   //   mov a, <default>
3287   //   j<C1> label        /* only if C1 != Br_None */
3288   //   j<C2> label        /* only if C2 != Br_None */
3289   //                      /* (the next mov is marked via _redefined()) */
3290   //   mov a, !<default>  /* only if C1 != Br_None */
3291   //   label:             /* only if C1 != Br_None */
3292   //
3293   // setcc lowering when C1 != Br_None && C2 == Br_None:
3294   //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
3295   //   setcc a, C1
3296   InstFcmp::FCond Condition = Fcmp->getCondition();
3297   assert(Condition < Traits::TableFcmpSize);
3298   if (Traits::TableFcmp[Condition].SwapScalarOperands)
3299     std::swap(Src0, Src1);
3300   const bool HasC1 = (Traits::TableFcmp[Condition].C1 != Traits::Cond::Br_None);
3301   const bool HasC2 = (Traits::TableFcmp[Condition].C2 != Traits::Cond::Br_None);
3302   if (HasC1) {
3303     Src0 = legalize(Src0);
3304     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3305     Variable *T = nullptr;
3306     _mov(T, Src0);
3307     _ucomiss(T, Src1RM);
3308     if (!HasC2) {
           // Single-condition case: the result is exactly "C1 holds", which is
           // only valid when the table's Default is nonzero (asserted here).
3309       assert(Traits::TableFcmp[Condition].Default);
3310       setccOrConsumer(Traits::TableFcmp[Condition].C1, Dest, Consumer);
3311       return;
3312     }
3313   }
       // From here on, "C1 or C2 taken" means the result equals IntDefault.
3314   int32_t IntDefault = Traits::TableFcmp[Condition].Default;
3315   if (Consumer == nullptr) {
3316     Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
3317     _mov(Dest, Default);
3318     if (HasC1) {
3319       InstX86Label *Label = InstX86Label::create(Func, this);
3320       _br(Traits::TableFcmp[Condition].C1, Label);
3321       if (HasC2) {
3322         _br(Traits::TableFcmp[Condition].C2, Label);
3323       }
3324       Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
3325       _redefined(_mov(Dest, NonDefault));
3326       Context.insert(Label);
3327     }
3328     return;
3329   }
3330   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3331     CfgNode *TrueSucc = Br->getTargetTrue();
3332     CfgNode *FalseSucc = Br->getTargetFalse();
         // After this optional swap, FalseSucc is the successor taken when the
         // result equals the table default (i.e. when C1 or C2 is taken).
3333     if (IntDefault != 0)
3334       std::swap(TrueSucc, FalseSucc);
3335     if (HasC1) {
3336       _br(Traits::TableFcmp[Condition].C1, FalseSucc);
3337       if (HasC2) {
3338         _br(Traits::TableFcmp[Condition].C2, FalseSucc);
3339       }
3340       _br(TrueSucc);
3341       return;
3342     }
         // No condition to test: the result is always the default.
3343     _br(FalseSucc);
3344     return;
3345   }
3346   if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3347     Operand *SrcT = Select->getTrueOperand();
3348     Operand *SrcF = Select->getFalseOperand();
3349     Variable *SelectDest = Select->getDest();
         // After this optional swap, SrcF is the value selected when the result
         // equals the table default; it is moved first and conditionally
         // overwritten with SrcT below.
3350     if (IntDefault != 0)
3351       std::swap(SrcT, SrcF);
3352     lowerMove(SelectDest, SrcF, false);
3353     if (HasC1) {
3354       InstX86Label *Label = InstX86Label::create(Func, this);
3355       _br(Traits::TableFcmp[Condition].C1, Label);
3356       if (HasC2) {
3357         _br(Traits::TableFcmp[Condition].C2, Label);
3358       }
3359       static constexpr bool IsRedefinition = true;
3360       lowerMove(SelectDest, SrcT, IsRedefinition);
3361       Context.insert(Label);
3362     }
3363     return;
3364   }
3365   llvm::report_fatal_error("Unexpected consumer type");
3366 }
3367
/// Lowers a vector fcmp into the instruction's destination.
///
/// True and False conditions are materialized as constant vectors.  One and
/// Ueq are each built from two cmpps results combined with pand / por.  Every
/// other condition maps to a single cmpps using the predicate from
/// Traits::TableFcmp, after swapping the operands if the table asks for it.
///
/// \param Fcmp the compare to lower; a non-vector destination is a fatal
///        error.
3368 template <typename TraitsType>
3369 void TargetX86Base<TraitsType>::lowerFcmpVector(const InstFcmp *Fcmp) {
3370   Operand *Src0 = Fcmp->getSrc(0);
3371   Operand *Src1 = Fcmp->getSrc(1);
3372   Variable *Dest = Fcmp->getDest();
3373
3374   if (!isVectorType(Dest->getType()))
3375     llvm::report_fatal_error("Expected vector compare");
3376
3377   InstFcmp::FCond Condition = Fcmp->getCondition();
3378   assert(Condition < Traits::TableFcmpSize);
3379
3380   if (Traits::TableFcmp[Condition].SwapVectorOperands)
3381     std::swap(Src0, Src1);
3382
3383   Variable *T = nullptr;
3384
3385   if (Condition == InstFcmp::True) {
3386     // makeVectorOfMinusOnes() requires an integer vector type.
3387     T = makeVectorOfMinusOnes(IceType_v4i32);
3388   } else if (Condition == InstFcmp::False) {
3389     T = makeVectorOfZeros(Dest->getType());
3390   } else {
3391     Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3392     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
         // The second cmpps operand is forced into a register even if it
         // legalized to a memory operand.
3393     if (llvm::isa<X86OperandMem>(Src1RM))
3394       Src1RM = legalizeToReg(Src1RM);
3395
3396     switch (Condition) {
3397     default: {
3398       const CmppsCond Predicate = Traits::TableFcmp[Condition].Predicate;
3399       assert(Predicate != Traits::Cond::Cmpps_Invalid);
3400       T = makeReg(Src0RM->getType());
3401       _movp(T, Src0RM);
3402       _cmpps(T, Src1RM, Predicate);
3403     } break;
3404     case InstFcmp::One: {
3405       // Check both unequal and ordered.
3406       T = makeReg(Src0RM->getType());
3407       Variable *T2 = makeReg(Src0RM->getType());
3408       _movp(T, Src0RM);
3409       _cmpps(T, Src1RM, Traits::Cond::Cmpps_neq);
3410       _movp(T2, Src0RM);
3411       _cmpps(T2, Src1RM, Traits::Cond::Cmpps_ord);
3412       _pand(T, T2);
3413     } break;
3414     case InstFcmp::Ueq: {
3415       // Check both equal or unordered.
3416       T = makeReg(Src0RM->getType());
3417       Variable *T2 = makeReg(Src0RM->getType());
3418       _movp(T, Src0RM);
3419       _cmpps(T, Src1RM, Traits::Cond::Cmpps_eq);
3420       _movp(T2, Src0RM);
3421       _cmpps(T2, Src1RM, Traits::Cond::Cmpps_unord);
3422       _por(T, T2);
3423     } break;
3424     }
3425   }
3426
3427   assert(T != nullptr);
3428   _movp(Dest, T);
3429   eliminateNextVectorSextInstruction(Dest);
3430 }
3431
3432 inline bool isZero(const Operand *Opnd) {
3433   if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
3434     return C64->getValue() == 0;
3435   if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
3436     return C32->getValue() == 0;
3437   return false;
3438 }
3439
3440 template <typename TraitsType>
3441 void TargetX86Base<TraitsType>::lowerIcmpAndConsumer(const InstIcmp *Icmp,
3442                                                      const Inst *Consumer) {
3443   Operand *Src0 = legalize(Icmp->getSrc(0));
3444   Operand *Src1 = legalize(Icmp->getSrc(1));
3445   Variable *Dest = Icmp->getDest();
3446
3447   if (isVectorType(Dest->getType()))
3448     llvm::report_fatal_error("Vector compare/branch cannot be folded");
3449
3450   if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
3451     lowerIcmp64(Icmp, Consumer);
3452     return;
3453   }
3454
3455   // cmp b, c
3456   if (isZero(Src1)) {
3457     switch (Icmp->getCondition()) {
3458     default:
3459       break;
3460     case InstIcmp::Uge:
3461       movOrConsumer(true, Dest, Consumer);
3462       return;
3463     case InstIcmp::Ult:
3464       movOrConsumer(false, Dest, Consumer);
3465       return;
3466     }
3467   }
3468   Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
3469   _cmp(Src0RM, Src1);
3470   setccOrConsumer(Traits::getIcmp32Mapping(Icmp->getCondition()), Dest,
3471                   Consumer);
3472 }
3473
/// Lowers a vector icmp into the instruction's destination.
///
/// i1 vector operands are first sign-extended to the integer vector type with
/// the same lane count.  Unsigned orderings are converted to signed ones by
/// xor-ing both operands with a vector of high-order bits.  The comparison
/// itself is built from pcmpeq / pcmpgt, with a pxor against all-ones where
/// the result has to be inverted.
///
/// \param Icmp the compare to lower; a non-vector destination is a fatal
///        error.
3474 template <typename TraitsType>
3475 void TargetX86Base<TraitsType>::lowerIcmpVector(const InstIcmp *Icmp) {
3476   Operand *Src0 = legalize(Icmp->getSrc(0));
3477   Operand *Src1 = legalize(Icmp->getSrc(1));
3478   Variable *Dest = Icmp->getDest();
3479
3480   if (!isVectorType(Dest->getType()))
3481     llvm::report_fatal_error("Expected a vector compare");
3482
3483   Type Ty = Src0->getType();
3484   // Promote i1 vectors to 128 bit integer vector types.
3485   if (typeElementType(Ty) == IceType_i1) {
3486     Type NewTy = IceType_NUM;
3487     switch (Ty) {
3488     default:
3489       llvm::report_fatal_error("unexpected type");
3490       break;
3491     case IceType_v4i1:
3492       NewTy = IceType_v4i32;
3493       break;
3494     case IceType_v8i1:
3495       NewTy = IceType_v8i16;
3496       break;
3497     case IceType_v16i1:
3498       NewTy = IceType_v16i8;
3499       break;
3500     }
3501     Variable *NewSrc0 = Func->makeVariable(NewTy);
3502     Variable *NewSrc1 = Func->makeVariable(NewTy);
3503     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
3504     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
3505     Src0 = NewSrc0;
3506     Src1 = NewSrc1;
3507     Ty = NewTy;
3508   }
3509
3510   InstIcmp::ICond Condition = Icmp->getCondition();
3511
3512   Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3513   Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3514
3515   // SSE2 only has signed comparison operations. Transform unsigned inputs in
3516   // a manner that allows for the use of signed comparison operations by
3517   // flipping the high order bits.
3518   if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
3519       Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
3520     Variable *T0 = makeReg(Ty);
3521     Variable *T1 = makeReg(Ty);
3522     Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
3523     _movp(T0, Src0RM);
3524     _pxor(T0, HighOrderBits);
3525     _movp(T1, Src1RM);
3526     _pxor(T1, HighOrderBits);
3527     Src0RM = T0;
3528     Src1RM = T1;
3529   }
3530
       // In every case below, the operand passed as the second argument of
       // pcmpeq/pcmpgt is moved into a register if it legalized to memory.
3531   Variable *T = makeReg(Ty);
3532   switch (Condition) {
3533   default:
3534     llvm_unreachable("unexpected condition");
3535     break;
3536   case InstIcmp::Eq: {
3537     if (llvm::isa<X86OperandMem>(Src1RM))
3538       Src1RM = legalizeToReg(Src1RM);
3539     _movp(T, Src0RM);
3540     _pcmpeq(T, Src1RM);
3541   } break;
3542   case InstIcmp::Ne: {
         // !(Src0RM == Src1RM)
3543     if (llvm::isa<X86OperandMem>(Src1RM))
3544       Src1RM = legalizeToReg(Src1RM);
3545     _movp(T, Src0RM);
3546     _pcmpeq(T, Src1RM);
3547     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3548     _pxor(T, MinusOne);
3549   } break;
3550   case InstIcmp::Ugt:
3551   case InstIcmp::Sgt: {
         // Src0RM > Src1RM
3552     if (llvm::isa<X86OperandMem>(Src1RM))
3553       Src1RM = legalizeToReg(Src1RM);
3554     _movp(T, Src0RM);
3555     _pcmpgt(T, Src1RM);
3556   } break;
3557   case InstIcmp::Uge:
3558   case InstIcmp::Sge: {
3559     // !(Src1RM > Src0RM)
3560     if (llvm::isa<X86OperandMem>(Src0RM))
3561       Src0RM = legalizeToReg(Src0RM);
3562     _movp(T, Src1RM);
3563     _pcmpgt(T, Src0RM);
3564     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3565     _pxor(T, MinusOne);
3566   } break;
3567   case InstIcmp::Ult:
3568   case InstIcmp::Slt: {
         // Src1RM > Src0RM
3569     if (llvm::isa<X86OperandMem>(Src0RM))
3570       Src0RM = legalizeToReg(Src0RM);
3571     _movp(T, Src1RM);
3572     _pcmpgt(T, Src0RM);
3573   } break;
3574   case InstIcmp::Ule:
3575   case InstIcmp::Sle: {
3576     // !(Src0RM > Src1RM)
3577     if (llvm::isa<X86OperandMem>(Src1RM))
3578       Src1RM = legalizeToReg(Src1RM);
3579     _movp(T, Src0RM);
3580     _pcmpgt(T, Src1RM);
3581     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3582     _pxor(T, MinusOne);
3583   } break;
3584   }
3585
3586   _movp(Dest, T);
3587   eliminateNextVectorSextInstruction(Dest);
3588 }
3589
/// Lowers a 64-bit icmp for targets where Traits::Is64Bit is false, handling
/// each operand as separate low and high halves (loOperand / hiOperand).
///
/// Comparisons against a constant zero are special-cased: Eq/Ule/Ne/Ugt test
/// the "or" of both halves, Sge/Slt test only the sign bit of the high half,
/// and Uge/Ult are known constants.  Everything else uses the general
/// sequence: compare the high halves and branch on TableIcmp64's C1 (result
/// true) and C2 (result false), then compare the low halves and branch on C3.
///
/// \param Icmp the compare to lower.
/// \param Consumer nullptr to materialize the result into the icmp's
///        destination, or an InstBr / InstSelect to fold the compare into.
///        Any other consumer kind is a fatal error.
3590 template <typename TraitsType>
3591 template <typename T>
3592 typename std::enable_if<!T::Is64Bit, void>::type
3593 TargetX86Base<TraitsType>::lowerIcmp64(const InstIcmp *Icmp,
3594                                        const Inst *Consumer) {
3595   // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; a=0 (redefinition); L1:
3596   Operand *Src0 = legalize(Icmp->getSrc(0));
3597   Operand *Src1 = legalize(Icmp->getSrc(1));
3598   Variable *Dest = Icmp->getDest();
3599   InstIcmp::ICond Condition = Icmp->getCondition();
3600   assert(Condition < Traits::TableIcmp64Size);
3601   Operand *Src0LoRM = nullptr;
3602   Operand *Src0HiRM = nullptr;
3603   // Legalize the portions of Src0 that are going to be needed.
3604   if (isZero(Src1)) {
3605     switch (Condition) {
3606     default:
3607       llvm_unreachable("unexpected condition");
3608       break;
3609     // These two are not optimized, so we fall through to the general case,
3610     // which needs the upper and lower halves legalized.
3611     case InstIcmp::Sgt:
3612     case InstIcmp::Sle:
3613     // These four compare after performing an "or" of the high and low half, so
3614     // they need the upper and lower halves legalized.
3615     case InstIcmp::Eq:
3616     case InstIcmp::Ule:
3617     case InstIcmp::Ne:
3618     case InstIcmp::Ugt:
3619       Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
           // Deliberate fall through: the cases above also need the high half.
3620     // These two test only the high half's sign bit, so they need only
3621     // the upper half legalized.
3622     case InstIcmp::Sge:
3623     case InstIcmp::Slt:
3624       Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
3625       break;
3626
3627     // These two move constants and hence need no legalization.
3628     case InstIcmp::Uge:
3629     case InstIcmp::Ult:
3630       break;
3631     }
3632   } else {
3633     Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
3634     Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
3635   }
3636   // Optimize comparisons with zero.
3637   if (isZero(Src1)) {
3638     Constant *SignMask = Ctx->getConstantInt32(0x80000000);
3639     Variable *Temp = nullptr;
3640     switch (Condition) {
3641     default:
3642       llvm_unreachable("unexpected condition");
3643       break;
3644     case InstIcmp::Eq:
3645     case InstIcmp::Ule:
3646       // Mov Src0HiRM first, because it was legalized most recently, and will
3647       // sometimes avoid a move before the OR.
3648       _mov(Temp, Src0HiRM);
3649       _or(Temp, Src0LoRM);
3650       Context.insert<InstFakeUse>(Temp);
3651       setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer);
3652       return;
3653     case InstIcmp::Ne:
3654     case InstIcmp::Ugt:
3655       // Mov Src0HiRM first, because it was legalized most recently, and will
3656       // sometimes avoid a move before the OR.
3657       _mov(Temp, Src0HiRM);
3658       _or(Temp, Src0LoRM);
3659       Context.insert<InstFakeUse>(Temp);
3660       setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer);
3661       return;
3662     case InstIcmp::Uge:
3663       movOrConsumer(true, Dest, Consumer);
3664       return;
3665     case InstIcmp::Ult:
3666       movOrConsumer(false, Dest, Consumer);
3667       return;
3668     case InstIcmp::Sgt:
           // Not optimized; handled by the general compare below.
3669       break;
3670     case InstIcmp::Sge:
           // The result is true when the sign bit of the high half is clear.
3671       _test(Src0HiRM, SignMask);
3672       setccOrConsumer(Traits::Cond::Br_e, Dest, Consumer);
3673       return;
3674     case InstIcmp::Slt:
           // The result is true when the sign bit of the high half is set.
3675       _test(Src0HiRM, SignMask);
3676       setccOrConsumer(Traits::Cond::Br_ne, Dest, Consumer);
3677       return;
3678     case InstIcmp::Sle:
           // Not optimized; handled by the general compare below.
3679       break;
3680     }
3681   }
3682   // Handle general compares.
3683   Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
3684   Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
3685   if (Consumer == nullptr) {
3686     Constant *Zero = Ctx->getConstantInt(Dest->getType(), 0);
3687     Constant *One = Ctx->getConstantInt(Dest->getType(), 1);
3688     InstX86Label *LabelFalse = InstX86Label::create(Func, this);
3689     InstX86Label *LabelTrue = InstX86Label::create(Func, this);
         // Start with Dest = 1 and overwrite it with 0 only on the false path.
3690     _mov(Dest, One);
3691     _cmp(Src0HiRM, Src1HiRI);
3692     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
3693       _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
3694     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
3695       _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
3696     _cmp(Src0LoRM, Src1LoRI);
3697     _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
3698     Context.insert(LabelFalse);
3699     _redefined(_mov(Dest, Zero));
3700     Context.insert(LabelTrue);
3701     return;
3702   }
3703   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
         // Same compare sequence, branching straight to the consumer's
         // successors instead of to local labels.
3704     _cmp(Src0HiRM, Src1HiRI);
3705     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
3706       _br(Traits::TableIcmp64[Condition].C1, Br->getTargetTrue());
3707     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
3708       _br(Traits::TableIcmp64[Condition].C2, Br->getTargetFalse());
3709     _cmp(Src0LoRM, Src1LoRI);
3710     _br(Traits::TableIcmp64[Condition].C3, Br->getTargetTrue(),
3711         Br->getTargetFalse());
3712     return;
3713   }
3714   if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3715     Operand *SrcT = Select->getTrueOperand();
3716     Operand *SrcF = Select->getFalseOperand();
3717     Variable *SelectDest = Select->getDest();
3718     InstX86Label *LabelFalse = InstX86Label::create(Func, this);
3719     InstX86Label *LabelTrue = InstX86Label::create(Func, this);
         // Start with the true operand and overwrite it with the false operand
         // only on the false path.
3720     lowerMove(SelectDest, SrcT, false);
3721     _cmp(Src0HiRM, Src1HiRI);
3722     if (Traits::TableIcmp64[Condition].C1 != Traits::Cond::Br_None)
3723       _br(Traits::TableIcmp64[Condition].C1, LabelTrue);
3724     if (Traits::TableIcmp64[Condition].C2 != Traits::Cond::Br_None)
3725       _br(Traits::TableIcmp64[Condition].C2, LabelFalse);
3726     _cmp(Src0LoRM, Src1LoRI);
3727     _br(Traits::TableIcmp64[Condition].C3, LabelTrue);
3728     Context.insert(LabelFalse);
3729     static constexpr bool IsRedefinition = true;
3730     lowerMove(SelectDest, SrcF, IsRedefinition);
3731     Context.insert(LabelTrue);
3732     return;
3733   }
3734   llvm::report_fatal_error("Unexpected consumer type");
3735 }
3736
3737 template <typename TraitsType>
3738 void TargetX86Base<TraitsType>::setccOrConsumer(BrCond Condition,
3739                                                 Variable *Dest,
3740                                                 const Inst *Consumer) {
3741   if (Consumer == nullptr) {
3742     _setcc(Dest, Condition);
3743     return;
3744   }
3745   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3746     _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
3747     return;
3748   }
3749   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3750     Operand *SrcT = Select->getTrueOperand();
3751     Operand *SrcF = Select->getFalseOperand();
3752     Variable *SelectDest = Select->getDest();
3753     lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
3754     return;
3755   }
3756   llvm::report_fatal_error("Unexpected consumer type");
3757 }
3758
3759 template <typename TraitsType>
3760 void TargetX86Base<TraitsType>::movOrConsumer(bool IcmpResult, Variable *Dest,
3761                                               const Inst *Consumer) {
3762   if (Consumer == nullptr) {
3763     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3764     return;
3765   }
3766   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3767     // TODO(sehr,stichnot): This could be done with a single unconditional
3768     // branch instruction, but subzero doesn't know how to handle the resulting
3769     // control flow graph changes now.  Make it do so to eliminate mov and cmp.
3770     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3771     _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
3772     _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3773     return;
3774   }
3775   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3776     Operand *Src = nullptr;
3777     if (IcmpResult) {
3778       Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
3779     } else {
3780       Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
3781     }
3782     Variable *SelectDest = Select->getDest();
3783     lowerMove(SelectDest, Src, false);
3784     return;
3785   }
3786   llvm::report_fatal_error("Unexpected consumer type");
3787 }
3788
3789 template <typename TraitsType>
3790 void TargetX86Base<TraitsType>::lowerArithAndConsumer(
3791     const InstArithmetic *Arith, const Inst *Consumer) {
3792   Variable *T = nullptr;
3793   Operand *Src0 = legalize(Arith->getSrc(0));
3794   Operand *Src1 = legalize(Arith->getSrc(1));
3795   Variable *Dest = Arith->getDest();
3796   switch (Arith->getOp()) {
3797   default:
3798     llvm_unreachable("arithmetic operator not AND or OR");
3799     break;
3800   case InstArithmetic::And:
3801     _mov(T, Src0);
3802     // Test cannot have an address in the second position.  Since T is
3803     // guaranteed to be a register and Src1 could be a memory load, ensure
3804     // that the second argument is a register.
3805     if (llvm::isa<Constant>(Src1))
3806       _test(T, Src1);
3807     else
3808       _test(Src1, T);
3809     break;
3810   case InstArithmetic::Or:
3811     _mov(T, Src0);
3812     _or(T, Src1);
3813     break;
3814   }
3815
3816   if (Consumer == nullptr) {
3817     llvm::report_fatal_error("Expected a consumer instruction");
3818   }
3819   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3820     Context.insert<InstFakeUse>(T);
3821     Context.insert<InstFakeDef>(Dest);
3822     _br(Traits::Cond::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3823     return;
3824   }
3825   llvm::report_fatal_error("Unexpected consumer type");
3826 }
3827
/// Lowers an insertelement instruction: produces a copy of the source vector
/// with one lane replaced by a scalar, and moves it to the destination.
///
/// One of three strategies is chosen from the vector type and the available
/// instruction set:
///   - insertps / pinsr for v8i16/v8i1, or for any vector type when the
///     instruction set is at least SSE4.1;
///   - movss (index 0) or a pair of shufps for the 4-lane vectors;
///   - a spill to a stack slot, a scalar store, and a reload for
///     v16i8/v16i1.
/// For i1 vectors the element is first zero-extended to the in-vector element
/// type.
///
/// \param Instr the insertelement instruction.  Its index operand must be a
///        ConstantInteger32 smaller than the number of lanes.
3828 template <typename TraitsType>
3829 void TargetX86Base<TraitsType>::lowerInsertElement(
3830     const InstInsertElement *Instr) {
3831   Operand *SourceVectNotLegalized = Instr->getSrc(0);
3832   Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
3833   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
3834   // Only constant indices are allowed in PNaCl IR.
3835   assert(ElementIndex);
3836   unsigned Index = ElementIndex->getValue();
3837   assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
3838
3839   Type Ty = SourceVectNotLegalized->getType();
3840   Type ElementTy = typeElementType(Ty);
3841   Type InVectorElementTy = Traits::getInVectorElementType(Ty);
3842
3843   if (ElementTy == IceType_i1) {
3844     // Expand the element to the appropriate size for it to be inserted in the
3845     // vector.
3846     Variable *Expanded = Func->makeVariable(InVectorElementTy);
3847     auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
3848                                   ElementToInsertNotLegalized);
3849     lowerCast(Cast);
3850     ElementToInsertNotLegalized = Expanded;
3851   }
3852
3853   if (Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
3854       InstructionSet >= Traits::SSE4_1) {
3855     // Use insertps, pinsrb, pinsrw, or pinsrd.
3856     Operand *ElementRM =
3857         legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3858     Operand *SourceVectRM =
3859         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3860     Variable *T = makeReg(Ty);
3861     _movp(T, SourceVectRM);
3862     if (Ty == IceType_v4f32) {
           // The lane index is passed to insertps shifted left by 4 bits.
3863       _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
3864     } else {
3865       // For the pinsrb and pinsrw instructions, when the source operand is a
3866       // register, it must be a full r32 register like eax, and not ax/al/ah.
3867       // For filetype=asm, InstX86Pinsr<TraitsType>::emit() compensates for
3868       // the use of r16 and r8 by converting them through getBaseReg(),
3869       // while emitIAS()
3870       // validates that the original and base register encodings are the same.
3871       if (ElementRM->getType() == IceType_i8 &&
3872           llvm::isa<Variable>(ElementRM)) {
3873         // Don't use ah/bh/ch/dh for pinsrb.
3874         ElementRM = copyToReg8(ElementRM);
3875       }
3876       _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
3877     }
3878     _movp(Instr->getDest(), T);
3879   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
3880     // Use shufps or movss.
3881     Variable *ElementR = nullptr;
3882     Operand *SourceVectRM =
3883         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3884
3885     if (InVectorElementTy == IceType_f32) {
3886       // ElementR will be in an XMM register since it is floating point.
3887       ElementR = legalizeToReg(ElementToInsertNotLegalized);
3888     } else {
3889       // Copy an integer to an XMM register.
3890       Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3891       ElementR = makeReg(Ty);
3892       _movd(ElementR, T);
3893     }
3894
3895     if (Index == 0) {
           // Lane 0 needs no shuffle: copy the vector and movss the element in.
3896       Variable *T = makeReg(Ty);
3897       _movp(T, SourceVectRM);
3898       _movss(T, ElementR);
3899       _movp(Instr->getDest(), T);
3900       return;
3901     }
3902
3903     // shufps treats the source and destination operands as vectors of four
3904     // doublewords. The destination's two high doublewords are selected from
3905     // the source operand and the two low doublewords are selected from the
3906     // (original value of) the destination operand. An insertelement operation
3907     // can be effected with a sequence of two shufps operations with
3908     // appropriate masks. In all cases below, Element[0] is being inserted into
3909     // SourceVectOperand. Indices are ordered from left to right.
3910     //
3911     // insertelement into index 1 (result is stored in ElementR):
3912     //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
3913     //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
3914     //
3915     // insertelement into index 2 (result is stored in T):
3916     //   T := SourceVectRM
3917     //   ElementR := ElementR[0, 0] T[0, 3]
3918     //   T := T[0, 1] ElementR[0, 3]
3919     //
3920     // insertelement into index 3 (result is stored in T):
3921     //   T := SourceVectRM
3922     //   ElementR := ElementR[0, 0] T[0, 2]
3923     //   T := T[0, 1] ElementR[3, 0]
         // The mask tables are indexed by Index - 1; Index is 1..3 here because
         // Index == 0 returned above and the 4-lane types have four elements.
3924     const unsigned char Mask1[3] = {0, 192, 128};
3925     const unsigned char Mask2[3] = {227, 196, 52};
3926
3927     Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
3928     Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
3929
3930     if (Index == 1) {
3931       _shufps(ElementR, SourceVectRM, Mask1Constant);
3932       _shufps(ElementR, SourceVectRM, Mask2Constant);
3933       _movp(Instr->getDest(), ElementR);
3934     } else {
3935       Variable *T = makeReg(Ty);
3936       _movp(T, SourceVectRM);
3937       _shufps(ElementR, T, Mask1Constant);
3938       _shufps(T, ElementR, Mask2Constant);
3939       _movp(Instr->getDest(), T);
3940     }
3941   } else {
3942     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
3943     // Spill the value to a stack slot and perform the insertion in memory.
3944     //
3945     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
3946     // for legalizing to mem is implemented.
3947     Variable *Slot = Func->makeVariable(Ty);
3948     Slot->setMustNotHaveReg();
3949     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
3950
3951     // Compute the location of the position to insert in memory.
3952     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
3953     X86OperandMem *Loc =
3954         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
3955     _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
3956
3957     Variable *T = makeReg(Ty);
3958     _movp(T, Slot);
3959     _movp(Instr->getDest(), T);
3960   }
3961 }
3962
3963 template <typename TraitsType>
3964 void TargetX86Base<TraitsType>::lowerIntrinsicCall(
3965     const InstIntrinsicCall *Instr) {
3966   switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
3967   case Intrinsics::AtomicCmpxchg: {
3968     if (!Intrinsics::isMemoryOrderValid(
3969             ID, getConstantMemoryOrder(Instr->getArg(3)),
3970             getConstantMemoryOrder(Instr->getArg(4)))) {
3971       Func->setError("Unexpected memory ordering for AtomicCmpxchg");
3972       return;
3973     }
3974     Variable *DestPrev = Instr->getDest();
3975     Operand *PtrToMem = legalize(Instr->getArg(0));
3976     Operand *Expected = legalize(Instr->getArg(1));
3977     Operand *Desired = legalize(Instr->getArg(2));
3978     if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
3979       return;
3980     lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
3981     return;
3982   }
3983   case Intrinsics::AtomicFence:
3984     if (!Intrinsics::isMemoryOrderValid(
3985             ID, getConstantMemoryOrder(Instr->getArg(0)))) {
3986       Func->setError("Unexpected memory ordering for AtomicFence");
3987       return;
3988     }
3989     _mfence();
3990     return;
3991   case Intrinsics::AtomicFenceAll:
3992     // NOTE: FenceAll should prevent and load/store from being moved across the
3993     // fence (both atomic and non-atomic). The InstX8632Mfence instruction is
3994     // currently marked coarsely as "HasSideEffects".
3995     _mfence();
3996     return;
3997   case Intrinsics::AtomicIsLockFree: {
3998     // X86 is always lock free for 8/16/32/64 bit accesses.
3999     // TODO(jvoung): Since the result is constant when given a constant byte
4000     // size, this opens up DCE opportunities.
4001     Operand *ByteSize = Instr->getArg(0);
4002     Variable *Dest = Instr->getDest();
4003     if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
4004       Constant *Result;
4005       switch (CI->getValue()) {
4006       default:
4007         // Some x86-64 processors support the cmpxchg16b instruction, which can
4008         // make 16-byte operations lock free (when used with the LOCK prefix).
4009         // However, that's not supported in 32-bit mode, so just return 0 even
4010         // for large sizes.
4011         Result = Ctx->getConstantZero(IceType_i32);
4012         break;
4013       case 1:
4014       case 2:
4015       case 4:
4016       case 8:
4017         Result = Ctx->getConstantInt32(1);
4018         break;
4019       }
4020       _mov(Dest, Result);
4021       return;
4022     }
4023     // The PNaCl ABI requires the byte size to be a compile-time constant.
4024     Func->setError("AtomicIsLockFree byte size should be compile-time const");
4025     return;
4026   }
4027   case Intrinsics::AtomicLoad: {
4028     // We require the memory address to be naturally aligned. Given that is the
4029     // case, then normal loads are atomic.
4030     if (!Intrinsics::isMemoryOrderValid(
4031             ID, getConstantMemoryOrder(Instr->getArg(1)))) {
4032       Func->setError("Unexpected memory ordering for AtomicLoad");
4033       return;
4034     }
4035     Variable *Dest = Instr->getDest();
4036     if (!Traits::Is64Bit) {
4037       if (auto *Dest64On32 = llvm::dyn_cast<Variable64On32>(Dest)) {
4038         // Follow what GCC does and use a movq instead of what lowerLoad()
4039         // normally does (split the load into two). Thus, this skips
4040         // load/arithmetic op folding. Load/arithmetic folding can't happen
4041         // anyway, since this is x86-32 and integer arithmetic only happens on
4042         // 32-bit quantities.
4043         Variable *T = makeReg(IceType_f64);
4044         X86OperandMem *Addr = formMemoryOperand(Instr->getArg(0), IceType_f64);
4045         _movq(T, Addr);
4046         // Then cast the bits back out of the XMM register to the i64 Dest.
4047         auto *Cast = InstCast::create(Func, InstCast::Bitcast, Dest, T);
4048         lowerCast(Cast);
4049         // Make sure that the atomic load isn't elided when unused.
4050         Context.insert<InstFakeUse>(Dest64On32->getLo());
4051         Context.insert<InstFakeUse>(Dest64On32->getHi());
4052         return;
4053       }
4054     }
4055     auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
4056     lowerLoad(Load);
4057     // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
4058     // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
4059     // the FakeUse on the last-inserted instruction's dest.
4060     Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
4061     return;
4062   }
4063   case Intrinsics::AtomicRMW:
4064     if (!Intrinsics::isMemoryOrderValid(
4065             ID, getConstantMemoryOrder(Instr->getArg(3)))) {
4066       Func->setError("Unexpected memory ordering for AtomicRMW");
4067       return;
4068     }
4069     lowerAtomicRMW(
4070         Instr->getDest(),
4071         static_cast<uint32_t>(
4072             llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
4073         Instr->getArg(1), Instr->getArg(2));
4074     return;
4075   case Intrinsics::AtomicStore: {
4076     if (!Intrinsics::isMemoryOrderValid(
4077             ID, getConstantMemoryOrder(Instr->getArg(2)))) {
4078       Func->setError("Unexpected memory ordering for AtomicStore");
4079       return;
4080     }
4081     // We require the memory address to be naturally aligned. Given that is the
4082     // case, then normal stores are atomic. Add a fence after the store to make
4083     // it visible.
4084     Operand *Value = Instr->getArg(0);
4085     Operand *Ptr = Instr->getArg(1);
4086     if (!Traits::Is64Bit && Value->getType() == IceType_i64) {
4087       // Use a movq instead of what lowerStore() normally does (split the store
4088       // into two), following what GCC does. Cast the bits from int -> to an
4089       // xmm register first.
4090       Variable *T = makeReg(IceType_f64);
4091       auto *Cast = InstCast::create(Func, InstCast::Bitcast, T, Value);
4092       lowerCast(Cast);
4093       // Then store XMM w/ a movq.
4094       X86OperandMem *Addr = formMemoryOperand(Ptr, IceType_f64);
4095       _storeq(T, Addr);
4096       _mfence();
4097       return;
4098     }
4099     auto *Store = InstStore::create(Func, Value, Ptr);
4100     lowerStore(Store);
4101     _mfence();
4102     return;
4103   }
4104   case Intrinsics::Bswap: {
4105     Variable *Dest = Instr->getDest();
4106     Operand *Val = Instr->getArg(0);
4107     // In 32-bit mode, bswap only works on 32-bit arguments, and the argument
4108     // must be a register. Use rotate left for 16-bit bswap.
4109     if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
4110       Val = legalizeUndef(Val);
4111       Variable *T_Lo = legalizeToReg(loOperand(Val));
4112       Variable *T_Hi = legalizeToReg(hiOperand(Val));
4113       auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
4114       auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4115       _bswap(T_Lo);
4116       _bswap(T_Hi);
4117       _mov(DestLo, T_Hi);
4118       _mov(DestHi, T_Lo);
4119     } else if ((Traits::Is64Bit && Val->getType() == IceType_i64) ||
4120                Val->getType() == IceType_i32) {
4121       Variable *T = legalizeToReg(Val);
4122       _bswap(T);
4123       _mov(Dest, T);
4124     } else {
4125       assert(Val->getType() == IceType_i16);
4126       Constant *Eight = Ctx->getConstantInt16(8);
4127       Variable *T = nullptr;
4128       Val = legalize(Val);
4129       _mov(T, Val);
4130       _rol(T, Eight);
4131       _mov(Dest, T);
4132     }
4133     return;
4134   }
4135   case Intrinsics::Ctpop: {
4136     Variable *Dest = Instr->getDest();
4137     Variable *T = nullptr;
4138     Operand *Val = Instr->getArg(0);
4139     Type ValTy = Val->getType();
4140     assert(ValTy == IceType_i32 || ValTy == IceType_i64);
4141
4142     if (!Traits::Is64Bit) {
4143       T = Dest;
4144     } else {
4145       T = makeReg(IceType_i64);
4146       if (ValTy == IceType_i32) {
4147         // in x86-64, __popcountsi2 is not defined, so we cheat a bit by
4148         // converting it to a 64-bit value, and using ctpop_i64. _movzx should
4149         // ensure we will not have any bits set on Val's upper 32 bits.
4150         Variable *V = makeReg(IceType_i64);
4151         _movzx(V, Val);
4152         Val = V;
4153       }
4154       ValTy = IceType_i64;
4155     }
4156
4157     InstCall *Call =
4158         makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
4159                                             : RuntimeHelper::H_call_ctpop_i64,
4160                        T, 1);
4161     Call->addArg(Val);
4162     lowerCall(Call);
4163     // The popcount helpers always return 32-bit values, while the intrinsic's
4164     // signature matches the native POPCNT instruction and fills a 64-bit reg
4165     // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
4166     // the user doesn't do that in the IR. If the user does that in the IR,
4167     // then this zero'ing instruction is dead and gets optimized out.
4168     if (!Traits::Is64Bit) {
4169       assert(T == Dest);
4170       if (Val->getType() == IceType_i64) {
4171         auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4172         Constant *Zero = Ctx->getConstantZero(IceType_i32);
4173         _mov(DestHi, Zero);
4174       }
4175     } else {
4176       assert(Val->getType() == IceType_i64);
4177       // T is 64 bit. It needs to be copied to dest. We need to:
4178       //
4179       // T_1.32 = trunc T.64 to i32
4180       // T_2.64 = zext T_1.32 to i64
4181       // Dest.<<right_size>> = T_2.<<right_size>>
4182       //
4183       // which ensures the upper 32 bits will always be cleared. Just doing a
4184       //
4185       // mov Dest.32 = trunc T.32 to i32
4186       //
4187       // is dangerous because there's a chance the compiler will optimize this
4188       // copy out. To use _movzx we need two new registers (one 32-, and
4189       // another 64-bit wide.)
4190       Variable *T_1 = makeReg(IceType_i32);
4191       _mov(T_1, T);
4192       Variable *T_2 = makeReg(IceType_i64);
4193       _movzx(T_2, T_1);
4194       _mov(Dest, T_2);
4195     }
4196     return;
4197   }
4198   case Intrinsics::Ctlz: {
4199     // The "is zero undef" parameter is ignored and we always return a
4200     // well-defined value.
4201     Operand *Val = legalize(Instr->getArg(0));
4202     Operand *FirstVal;
4203     Operand *SecondVal = nullptr;
4204     if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
4205       FirstVal = loOperand(Val);
4206       SecondVal = hiOperand(Val);
4207     } else {
4208       FirstVal = Val;
4209     }
4210     constexpr bool IsCttz = false;
4211     lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
4212                     SecondVal);
4213     return;
4214   }
4215   case Intrinsics::Cttz: {
4216     // The "is zero undef" parameter is ignored and we always return a
4217     // well-defined value.
4218     Operand *Val = legalize(Instr->getArg(0));
4219     Operand *FirstVal;
4220     Operand *SecondVal = nullptr;
4221     if (!Traits::Is64Bit && Val->getType() == IceType_i64) {
4222       FirstVal = hiOperand(Val);
4223       SecondVal = loOperand(Val);
4224     } else {
4225       FirstVal = Val;
4226     }
4227     constexpr bool IsCttz = true;
4228     lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
4229                     SecondVal);
4230     return;
4231   }
4232   case Intrinsics::Fabs: {
4233     Operand *Src = legalize(Instr->getArg(0));
4234     Type Ty = Src->getType();
4235     Variable *Dest = Instr->getDest();
4236     Variable *T = makeVectorOfFabsMask(Ty);
4237     // The pand instruction operates on an m128 memory operand, so if Src is an
4238     // f32 or f64, we need to make sure it's in a register.
4239     if (isVectorType(Ty)) {
4240       if (llvm::isa<X86OperandMem>(Src))
4241         Src = legalizeToReg(Src);
4242     } else {
4243       Src = legalizeToReg(Src);
4244     }
4245     _pand(T, Src);
4246     if (isVectorType(Ty))
4247       _movp(Dest, T);
4248     else
4249       _mov(Dest, T);
4250     return;
4251   }
4252   case Intrinsics::Longjmp: {
4253     InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
4254     Call->addArg(Instr->getArg(0));
4255     Call->addArg(Instr->getArg(1));
4256     lowerCall(Call);
4257     return;
4258   }
4259   case Intrinsics::Memcpy: {
4260     lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4261     return;
4262   }
4263   case Intrinsics::Memmove: {
4264     lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4265     return;
4266   }
4267   case Intrinsics::Memset: {
4268     lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
4269     return;
4270   }
4271   case Intrinsics::NaClReadTP: {
4272     if (NeedSandboxing) {
4273       Operand *Src =
4274           dispatchToConcrete(&ConcreteTarget::createNaClReadTPSrcOperand);
4275       Variable *Dest = Instr->getDest();
4276       Variable *T = nullptr;
4277       _mov(T, Src);
4278       _mov(Dest, T);
4279     } else {
4280       InstCall *Call =
4281           makeHelperCall(RuntimeHelper::H_call_read_tp, Instr->getDest(), 0);
4282       lowerCall(Call);
4283     }
4284     return;
4285   }
4286   case Intrinsics::Setjmp: {
4287     InstCall *Call =
4288         makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
4289     Call->addArg(Instr->getArg(0));
4290     lowerCall(Call);
4291     return;
4292   }
4293   case Intrinsics::Sqrt: {
4294     Operand *Src = legalize(Instr->getArg(0));
4295     Variable *Dest = Instr->getDest();
4296     Variable *T = makeReg(Dest->getType());
4297     _sqrtss(T, Src);
4298     _mov(Dest, T);
4299     return;
4300   }
4301   case Intrinsics::Stacksave: {
4302     if (!Traits::Is64Bit || !NeedSandboxing) {
4303       Variable *esp = Func->getTarget()->getPhysicalRegister(getStackReg(),
4304                                                              Traits::WordType);
4305       Variable *Dest = Instr->getDest();
4306       _mov(Dest, esp);
4307       return;
4308     }
4309     Variable *esp = Func->getTarget()->getPhysicalRegister(
4310         Traits::RegisterSet::Reg_esp, IceType_i32);
4311     Variable *Dest = Instr->getDest();
4312     _mov(Dest, esp);
4313
4314     return;
4315   }
4316   case Intrinsics::Stackrestore: {
4317     Operand *Src = Instr->getArg(0);
4318     _mov_sp(Src);
4319     return;
4320   }
4321
4322   case Intrinsics::Trap:
4323     _ud2();
4324     return;
4325   case Intrinsics::UnknownIntrinsic:
4326     Func->setError("Should not be lowering UnknownIntrinsic");
4327     return;
4328   }
4329   return;
4330 }
4331
4332 template <typename TraitsType>
4333 void TargetX86Base<TraitsType>::lowerAtomicCmpxchg(Variable *DestPrev,
4334                                                    Operand *Ptr,
4335                                                    Operand *Expected,
4336                                                    Operand *Desired) {
4337   Type Ty = Expected->getType();
4338   if (!Traits::Is64Bit && Ty == IceType_i64) {
4339     // Reserve the pre-colored registers first, before adding any more
4340     // infinite-weight variables from formMemoryOperand's legalization.
4341     Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
4342     Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
4343     Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
4344     Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
4345     _mov(T_eax, loOperand(Expected));
4346     _mov(T_edx, hiOperand(Expected));
4347     _mov(T_ebx, loOperand(Desired));
4348     _mov(T_ecx, hiOperand(Desired));
4349     X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4350     constexpr bool Locked = true;
4351     _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
4352     auto *DestLo = llvm::cast<Variable>(loOperand(DestPrev));
4353     auto *DestHi = llvm::cast<Variable>(hiOperand(DestPrev));
4354     _mov(DestLo, T_eax);
4355     _mov(DestHi, T_edx);
4356     return;
4357   }
4358   RegNumT Eax;
4359   switch (Ty) {
4360   default:
4361     llvm::report_fatal_error("Bad type for cmpxchg");
4362   case IceType_i64:
4363     Eax = Traits::getRaxOrDie();
4364     break;
4365   case IceType_i32:
4366     Eax = Traits::RegisterSet::Reg_eax;
4367     break;
4368   case IceType_i16:
4369     Eax = Traits::RegisterSet::Reg_ax;
4370     break;
4371   case IceType_i8:
4372     Eax = Traits::RegisterSet::Reg_al;
4373     break;
4374   }
4375   Variable *T_eax = makeReg(Ty, Eax);
4376   _mov(T_eax, Expected);
4377   X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4378   Variable *DesiredReg = legalizeToReg(Desired);
4379   constexpr bool Locked = true;
4380   _cmpxchg(Addr, T_eax, DesiredReg, Locked);
4381   _mov(DestPrev, T_eax);
4382 }
4383
4384 template <typename TraitsType>
4385 bool TargetX86Base<TraitsType>::tryOptimizedCmpxchgCmpBr(Variable *Dest,
4386                                                          Operand *PtrToMem,
4387                                                          Operand *Expected,
4388                                                          Operand *Desired) {
4389   if (Func->getOptLevel() == Opt_m1)
4390     return false;
4391   // Peek ahead a few instructions and see how Dest is used.
4392   // It's very common to have:
4393   //
4394   // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
4395   // [%y_phi = ...] // list of phi stores
4396   // %p = icmp eq i32 %x, %expected
4397   // br i1 %p, label %l1, label %l2
4398   //
4399   // which we can optimize into:
4400   //
4401   // %x = <cmpxchg code>
4402   // [%y_phi = ...] // list of phi stores
4403   // br eq, %l1, %l2
4404   InstList::iterator I = Context.getCur();
4405   // I is currently the InstIntrinsicCall. Peek past that.
4406   // This assumes that the atomic cmpxchg has not been lowered yet,
4407   // so that the instructions seen in the scan from "Cur" is simple.
4408   assert(llvm::isa<InstIntrinsicCall>(*I));
4409   Inst *NextInst = Context.getNextInst(I);
4410   if (!NextInst)
4411     return false;
4412   // There might be phi assignments right before the compare+branch, since this
4413   // could be a backward branch for a loop. This placement of assignments is
4414   // determined by placePhiStores().
4415   CfgVector<InstAssign *> PhiAssigns;
4416   while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
4417     if (PhiAssign->getDest() == Dest)
4418       return false;
4419     PhiAssigns.push_back(PhiAssign);
4420     NextInst = Context.getNextInst(I);
4421     if (!NextInst)
4422       return false;
4423   }
4424   if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
4425     if (!(NextCmp->getCondition() == InstIcmp::Eq &&
4426           ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
4427            (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
4428       return false;
4429     }
4430     NextInst = Context.getNextInst(I);
4431     if (!NextInst)
4432       return false;
4433     if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
4434       if (!NextBr->isUnconditional() &&
4435           NextCmp->getDest() == NextBr->getCondition() &&
4436           NextBr->isLastUse(NextCmp->getDest())) {
4437         lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
4438         for (size_t i = 0; i < PhiAssigns.size(); ++i) {
4439           // Lower the phi assignments now, before the branch (same placement
4440           // as before).
4441           InstAssign *PhiAssign = PhiAssigns[i];
4442           PhiAssign->setDeleted();
4443           lowerAssign(PhiAssign);
4444           Context.advanceNext();
4445         }
4446         _br(Traits::Cond::Br_e, NextBr->getTargetTrue(),
4447             NextBr->getTargetFalse());
4448         // Skip over the old compare and branch, by deleting them.
4449         NextCmp->setDeleted();
4450         NextBr->setDeleted();
4451         Context.advanceNext();
4452         Context.advanceNext();
4453         return true;
4454       }
4455     }
4456   }
4457   return false;
4458 }
4459
4460 template <typename TraitsType>
4461 void TargetX86Base<TraitsType>::lowerAtomicRMW(Variable *Dest,
4462                                                uint32_t Operation, Operand *Ptr,
4463                                                Operand *Val) {
4464   bool NeedsCmpxchg = false;
4465   LowerBinOp Op_Lo = nullptr;
4466   LowerBinOp Op_Hi = nullptr;
4467   switch (Operation) {
4468   default:
4469     Func->setError("Unknown AtomicRMW operation");
4470     return;
4471   case Intrinsics::AtomicAdd: {
4472     if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
4473       // All the fall-through paths must set this to true, but use this
4474       // for asserting.
4475       NeedsCmpxchg = true;
4476       Op_Lo = &TargetX86Base<TraitsType>::_add;
4477       Op_Hi = &TargetX86Base<TraitsType>::_adc;
4478       break;
4479     }
4480     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4481     constexpr bool Locked = true;
4482     Variable *T = nullptr;
4483     _mov(T, Val);
4484     _xadd(Addr, T, Locked);
4485     _mov(Dest, T);
4486     return;
4487   }
4488   case Intrinsics::AtomicSub: {
4489     if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
4490       NeedsCmpxchg = true;
4491       Op_Lo = &TargetX86Base<TraitsType>::_sub;
4492       Op_Hi = &TargetX86Base<TraitsType>::_sbb;
4493       break;
4494     }
4495     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4496     constexpr bool Locked = true;
4497     Variable *T = nullptr;
4498     _mov(T, Val);
4499     _neg(T);
4500     _xadd(Addr, T, Locked);
4501     _mov(Dest, T);
4502     return;
4503   }
4504   case Intrinsics::AtomicOr:
4505     // TODO(jvoung): If Dest is null or dead, then some of these
4506     // operations do not need an "exchange", but just a locked op.
4507     // That appears to be "worth" it for sub, or, and, and xor.
4508     // xadd is probably fine vs lock add for add, and xchg is fine
4509     // vs an atomic store.
4510     NeedsCmpxchg = true;
4511     Op_Lo = &TargetX86Base<TraitsType>::_or;
4512     Op_Hi = &TargetX86Base<TraitsType>::_or;
4513     break;
4514   case Intrinsics::AtomicAnd:
4515     NeedsCmpxchg = true;
4516     Op_Lo = &TargetX86Base<TraitsType>::_and;
4517     Op_Hi = &TargetX86Base<TraitsType>::_and;
4518     break;
4519   case Intrinsics::AtomicXor:
4520     NeedsCmpxchg = true;
4521     Op_Lo = &TargetX86Base<TraitsType>::_xor;
4522     Op_Hi = &TargetX86Base<TraitsType>::_xor;
4523     break;
4524   case Intrinsics::AtomicExchange:
4525     if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
4526       NeedsCmpxchg = true;
4527       // NeedsCmpxchg, but no real Op_Lo/Op_Hi need to be done. The values
4528       // just need to be moved to the ecx and ebx registers.
4529       Op_Lo = nullptr;
4530       Op_Hi = nullptr;
4531       break;
4532     }
4533     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
4534     Variable *T = nullptr;
4535     _mov(T, Val);
4536     _xchg(Addr, T);
4537     _mov(Dest, T);
4538     return;
4539   }
4540   // Otherwise, we need a cmpxchg loop.
4541   (void)NeedsCmpxchg;
4542   assert(NeedsCmpxchg);
4543   expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
4544 }
4545
4546 template <typename TraitsType>
4547 void TargetX86Base<TraitsType>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo,
4548                                                          LowerBinOp Op_Hi,
4549                                                          Variable *Dest,
4550                                                          Operand *Ptr,
4551                                                          Operand *Val) {
4552   // Expand a more complex RMW operation as a cmpxchg loop:
4553   // For 64-bit:
4554   //   mov     eax, [ptr]
4555   //   mov     edx, [ptr + 4]
4556   // .LABEL:
4557   //   mov     ebx, eax
4558   //   <Op_Lo> ebx, <desired_adj_lo>
4559   //   mov     ecx, edx
4560   //   <Op_Hi> ecx, <desired_adj_hi>
4561   //   lock cmpxchg8b [ptr]
4562   //   jne     .LABEL
4563   //   mov     <dest_lo>, eax
4564   //   mov     <dest_lo>, edx
4565   //
4566   // For 32-bit:
4567   //   mov     eax, [ptr]
4568   // .LABEL:
4569   //   mov     <reg>, eax
4570   //   op      <reg>, [desired_adj]
4571   //   lock cmpxchg [ptr], <reg>
4572   //   jne     .LABEL
4573   //   mov     <dest>, eax
4574   //
4575   // If Op_{Lo,Hi} are nullptr, then just copy the value.
4576   Val = legalize(Val);
4577   Type Ty = Val->getType();
4578   if (!Traits::Is64Bit && Ty == IceType_i64) {
4579     Variable *T_edx = makeReg(IceType_i32, Traits::RegisterSet::Reg_edx);
4580     Variable *T_eax = makeReg(IceType_i32, Traits::RegisterSet::Reg_eax);
4581     X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4582     _mov(T_eax, loOperand(Addr));
4583     _mov(T_edx, hiOperand(Addr));
4584     Variable *T_ecx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ecx);
4585     Variable *T_ebx = makeReg(IceType_i32, Traits::RegisterSet::Reg_ebx);
4586     InstX86Label *Label = InstX86Label::create(Func, this);
4587     const bool IsXchg8b = Op_Lo == nullptr && Op_Hi == nullptr;
4588     if (!IsXchg8b) {
4589       Context.insert(Label);
4590       _mov(T_ebx, T_eax);
4591       (this->*Op_Lo)(T_ebx, loOperand(Val));
4592       _mov(T_ecx, T_edx);
4593       (this->*Op_Hi)(T_ecx, hiOperand(Val));
4594     } else {
4595       // This is for xchg, which doesn't need an actual Op_Lo/Op_Hi.
4596       // It just needs the Val loaded into ebx and ecx.
4597       // That can also be done before the loop.
4598       _mov(T_ebx, loOperand(Val));
4599       _mov(T_ecx, hiOperand(Val));
4600       Context.insert(Label);
4601     }
4602     constexpr bool Locked = true;
4603     _cmpxchg8b(Addr, T_edx, T_eax, T_ecx, T_ebx, Locked);
4604     _br(Traits::Cond::Br_ne, Label);
4605     if (!IsXchg8b) {
4606       // If Val is a variable, model the extended live range of Val through
4607       // the end of the loop, since it will be re-used by the loop.
4608       if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
4609         auto *ValLo = llvm::cast<Variable>(loOperand(ValVar));
4610         auto *ValHi = llvm::cast<Variable>(hiOperand(ValVar));
4611         Context.insert<InstFakeUse>(ValLo);
4612         Context.insert<InstFakeUse>(ValHi);
4613       }
4614     } else {
4615       // For xchg, the loop is slightly smaller and ebx/ecx are used.
4616       Context.insert<InstFakeUse>(T_ebx);
4617       Context.insert<InstFakeUse>(T_ecx);
4618     }
4619     // The address base (if any) is also reused in the loop.
4620     if (Variable *Base = Addr->getBase())
4621       Context.insert<InstFakeUse>(Base);
4622     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
4623     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4624     _mov(DestLo, T_eax);
4625     _mov(DestHi, T_edx);
4626     return;
4627   }
4628   X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4629   RegNumT Eax;
4630   switch (Ty) {
4631   default:
4632     llvm::report_fatal_error("Bad type for atomicRMW");
4633   case IceType_i64:
4634     Eax = Traits::getRaxOrDie();
4635     break;
4636   case IceType_i32:
4637     Eax = Traits::RegisterSet::Reg_eax;
4638     break;
4639   case IceType_i16:
4640     Eax = Traits::RegisterSet::Reg_ax;
4641     break;
4642   case IceType_i8:
4643     Eax = Traits::RegisterSet::Reg_al;
4644     break;
4645   }
4646   Variable *T_eax = makeReg(Ty, Eax);
4647   _mov(T_eax, Addr);
4648   auto *Label = Context.insert<InstX86Label>(this);
4649   // We want to pick a different register for T than Eax, so don't use
4650   // _mov(T == nullptr, T_eax).
4651   Variable *T = makeReg(Ty);
4652   _mov(T, T_eax);
4653   (this->*Op_Lo)(T, Val);
4654   constexpr bool Locked = true;
4655   _cmpxchg(Addr, T_eax, T, Locked);
4656   _br(Traits::Cond::Br_ne, Label);
4657   // If Val is a variable, model the extended live range of Val through
4658   // the end of the loop, since it will be re-used by the loop.
4659   if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
4660     Context.insert<InstFakeUse>(ValVar);
4661   }
4662   // The address base (if any) is also reused in the loop.
4663   if (Variable *Base = Addr->getBase())
4664     Context.insert<InstFakeUse>(Base);
4665   _mov(Dest, T_eax);
4666 }
4667
4668 /// Lowers count {trailing, leading} zeros intrinsic.
4669 ///
4670 /// We could do constant folding here, but that should have
4671 /// been done by the front-end/middle-end optimizations.
4672 template <typename TraitsType>
4673 void TargetX86Base<TraitsType>::lowerCountZeros(bool Cttz, Type Ty,
4674                                                 Variable *Dest,
4675                                                 Operand *FirstVal,
4676                                                 Operand *SecondVal) {
4677   // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
4678   // Then the instructions will handle the Val == 0 case much more simply
4679   // and won't require conversion from bit position to number of zeros.
4680   //
4681   // Otherwise:
4682   //   bsr IF_NOT_ZERO, Val
4683   //   mov T_DEST, ((Ty == i32) ? 63 : 127)
4684   //   cmovne T_DEST, IF_NOT_ZERO
4685   //   xor T_DEST, ((Ty == i32) ? 31 : 63)
4686   //   mov DEST, T_DEST
4687   //
4688   // NOTE: T_DEST must be a register because cmov requires its dest to be a
4689   // register. Also, bsf and bsr require their dest to be a register.
4690   //
4691   // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
4692   // E.g., for 000... 00001100, bsr will say that the most significant bit
4693   // set is at position 3, while the number of leading zeros is 28. Xor is
4694   // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
4695   // all-zeros case).
4696   //
4697   // X8632 only: Similar for 64-bit, but start w/ speculating that the upper 32
4698   // bits are all zero, and compute the result for that case (checking the
4699   // lower 32 bits). Then actually compute the result for the upper bits and
4700   // cmov in the result from the lower computation if the earlier speculation
4701   // was correct.
4702   //
4703   // Cttz, is similar, but uses bsf instead, and doesn't require the xor
4704   // bit position conversion, and the speculation is reversed.
4705
4706   // TODO(jpp): refactor this method.
4707   assert(Ty == IceType_i32 || Ty == IceType_i64);
4708   const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32;
4709   Variable *T = makeReg(DestTy);
4710   Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
4711   if (Cttz) {
4712     _bsf(T, FirstValRM);
4713   } else {
4714     _bsr(T, FirstValRM);
4715   }
4716   Variable *T_Dest = makeReg(DestTy);
4717   Constant *_31 = Ctx->getConstantInt32(31);
4718   Constant *_32 = Ctx->getConstantInt(DestTy, 32);
4719   Constant *_63 = Ctx->getConstantInt(DestTy, 63);
4720   Constant *_64 = Ctx->getConstantInt(DestTy, 64);
4721   if (Cttz) {
4722     if (DestTy == IceType_i64) {
4723       _mov(T_Dest, _64);
4724     } else {
4725       _mov(T_Dest, _32);
4726     }
4727   } else {
4728     Constant *_127 = Ctx->getConstantInt(DestTy, 127);
4729     if (DestTy == IceType_i64) {
4730       _mov(T_Dest, _127);
4731     } else {
4732       _mov(T_Dest, _63);
4733     }
4734   }
4735   _cmov(T_Dest, T, Traits::Cond::Br_ne);
4736   if (!Cttz) {
4737     if (DestTy == IceType_i64) {
4738       // Even though there's a _63 available at this point, that constant might
4739       // not be an i32, which will cause the xor emission to fail.
4740       Constant *_63 = Ctx->getConstantInt32(63);
4741       _xor(T_Dest, _63);
4742     } else {
4743       _xor(T_Dest, _31);
4744     }
4745   }
4746   if (Traits::Is64Bit || Ty == IceType_i32) {
4747     _mov(Dest, T_Dest);
4748     return;
4749   }
4750   _add(T_Dest, _32);
4751   auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
4752   auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
4753   // Will be using "test" on this, so we need a registerized variable.
4754   Variable *SecondVar = legalizeToReg(SecondVal);
4755   Variable *T_Dest2 = makeReg(IceType_i32);
4756   if (Cttz) {
4757     _bsf(T_Dest2, SecondVar);
4758   } else {
4759     _bsr(T_Dest2, SecondVar);
4760     _xor(T_Dest2, _31);
4761   }
4762   _test(SecondVar, SecondVar);
4763   _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
4764   _mov(DestLo, T_Dest2);
4765   _mov(DestHi, Ctx->getConstantZero(IceType_i32));
4766 }
4767
4768 template <typename TraitsType>
4769 void TargetX86Base<TraitsType>::typedLoad(Type Ty, Variable *Dest,
4770                                           Variable *Base, Constant *Offset) {
4771   // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
4772   // legalize Mem properly.
4773   if (Offset)
4774     assert(!llvm::isa<ConstantRelocatable>(Offset));
4775
4776   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4777
4778   if (isVectorType(Ty))
4779     _movp(Dest, Mem);
4780   else if (Ty == IceType_f64)
4781     _movq(Dest, Mem);
4782   else
4783     _mov(Dest, Mem);
4784 }
4785
4786 template <typename TraitsType>
4787 void TargetX86Base<TraitsType>::typedStore(Type Ty, Variable *Value,
4788                                            Variable *Base, Constant *Offset) {
4789   // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
4790   // legalize Mem properly.
4791   if (Offset)
4792     assert(!llvm::isa<ConstantRelocatable>(Offset));
4793
4794   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4795
4796   if (isVectorType(Ty))
4797     _storep(Value, Mem);
4798   else if (Ty == IceType_f64)
4799     _storeq(Value, Mem);
4800   else
4801     _store(Value, Mem);
4802 }
4803
4804 template <typename TraitsType>
4805 void TargetX86Base<TraitsType>::copyMemory(Type Ty, Variable *Dest,
4806                                            Variable *Src, int32_t OffsetAmt) {
4807   Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
4808   // TODO(ascull): this or add nullptr test to _movp, _movq
4809   Variable *Data = makeReg(Ty);
4810
4811   typedLoad(Ty, Data, Src, Offset);
4812   typedStore(Ty, Data, Dest, Offset);
4813 }
4814
4815 template <typename TraitsType>
4816 void TargetX86Base<TraitsType>::lowerMemcpy(Operand *Dest, Operand *Src,
4817                                             Operand *Count) {
4818   // There is a load and store for each chunk in the unroll
4819   constexpr uint32_t BytesPerStorep = 16;
4820
4821   // Check if the operands are constants
4822   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4823   const bool IsCountConst = CountConst != nullptr;
4824   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4825
4826   if (shouldOptimizeMemIntrins() && IsCountConst &&
4827       CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
4828     // Unlikely, but nothing to do if it does happen
4829     if (CountValue == 0)
4830       return;
4831
4832     Variable *SrcBase = legalizeToReg(Src);
4833     Variable *DestBase = legalizeToReg(Dest);
4834
4835     // Find the largest type that can be used and use it as much as possible in
4836     // reverse order. Then handle any remainder with overlapping copies. Since
4837     // the remainder will be at the end, there will be reduced pressure on the
4838     // memory unit as the accesses to the same memory are far apart.
4839     Type Ty = largestTypeInSize(CountValue);
4840     uint32_t TyWidth = typeWidthInBytes(Ty);
4841
4842     uint32_t RemainingBytes = CountValue;
4843     int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
4844     while (RemainingBytes >= TyWidth) {
4845       copyMemory(Ty, DestBase, SrcBase, Offset);
4846       RemainingBytes -= TyWidth;
4847       Offset -= TyWidth;
4848     }
4849
4850     if (RemainingBytes == 0)
4851       return;
4852
4853     // Lower the remaining bytes. Adjust to larger types in order to make use
4854     // of overlaps in the copies.
4855     Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
4856     Offset = CountValue - typeWidthInBytes(LeftOverTy);
4857     copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
4858     return;
4859   }
4860
4861   // Fall back on a function call
4862   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
4863   Call->addArg(Dest);
4864   Call->addArg(Src);
4865   Call->addArg(Count);
4866   lowerCall(Call);
4867 }
4868
4869 template <typename TraitsType>
4870 void TargetX86Base<TraitsType>::lowerMemmove(Operand *Dest, Operand *Src,
4871                                              Operand *Count) {
4872   // There is a load and store for each chunk in the unroll
4873   constexpr uint32_t BytesPerStorep = 16;
4874
4875   // Check if the operands are constants
4876   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4877   const bool IsCountConst = CountConst != nullptr;
4878   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4879
4880   if (shouldOptimizeMemIntrins() && IsCountConst &&
4881       CountValue <= BytesPerStorep * Traits::MEMMOVE_UNROLL_LIMIT) {
4882     // Unlikely, but nothing to do if it does happen
4883     if (CountValue == 0)
4884       return;
4885
4886     Variable *SrcBase = legalizeToReg(Src);
4887     Variable *DestBase = legalizeToReg(Dest);
4888
4889     std::tuple<Type, Constant *, Variable *>
4890         Moves[Traits::MEMMOVE_UNROLL_LIMIT];
4891     Constant *Offset;
4892     Variable *Reg;
4893
4894     // Copy the data into registers as the source and destination could overlap
4895     // so make sure not to clobber the memory. This also means overlapping
4896     // moves can be used as we are taking a safe snapshot of the memory.
4897     Type Ty = largestTypeInSize(CountValue);
4898     uint32_t TyWidth = typeWidthInBytes(Ty);
4899
4900     uint32_t RemainingBytes = CountValue;
4901     int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
4902     size_t N = 0;
4903     while (RemainingBytes >= TyWidth) {
4904       assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
4905       Offset = Ctx->getConstantInt32(OffsetAmt);
4906       Reg = makeReg(Ty);
4907       typedLoad(Ty, Reg, SrcBase, Offset);
4908       RemainingBytes -= TyWidth;
4909       OffsetAmt -= TyWidth;
4910       Moves[N++] = std::make_tuple(Ty, Offset, Reg);
4911     }
4912
4913     if (RemainingBytes != 0) {
4914       // Lower the remaining bytes. Adjust to larger types in order to make use
4915       // of overlaps in the copies.
4916       assert(N <= Traits::MEMMOVE_UNROLL_LIMIT);
4917       Ty = firstTypeThatFitsSize(RemainingBytes);
4918       Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
4919       Reg = makeReg(Ty);
4920       typedLoad(Ty, Reg, SrcBase, Offset);
4921       Moves[N++] = std::make_tuple(Ty, Offset, Reg);
4922     }
4923
4924     // Copy the data out into the destination memory
4925     for (size_t i = 0; i < N; ++i) {
4926       std::tie(Ty, Offset, Reg) = Moves[i];
4927       typedStore(Ty, Reg, DestBase, Offset);
4928     }
4929
4930     return;
4931   }
4932
4933   // Fall back on a function call
4934   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
4935   Call->addArg(Dest);
4936   Call->addArg(Src);
4937   Call->addArg(Count);
4938   lowerCall(Call);
4939 }
4940
4941 template <typename TraitsType>
4942 void TargetX86Base<TraitsType>::lowerMemset(Operand *Dest, Operand *Val,
4943                                             Operand *Count) {
4944   constexpr uint32_t BytesPerStorep = 16;
4945   constexpr uint32_t BytesPerStoreq = 8;
4946   constexpr uint32_t BytesPerStorei32 = 4;
4947   assert(Val->getType() == IceType_i8);
4948
4949   // Check if the operands are constants
4950   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4951   const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
4952   const bool IsCountConst = CountConst != nullptr;
4953   const bool IsValConst = ValConst != nullptr;
4954   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4955   const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
4956
4957   // Unlikely, but nothing to do if it does happen
4958   if (IsCountConst && CountValue == 0)
4959     return;
4960
4961   // TODO(ascull): if the count is constant but val is not it would be possible
4962   // to inline by spreading the value across 4 bytes and accessing subregs e.g.
4963   // eax, ax and al.
4964   if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
4965     Variable *Base = nullptr;
4966     Variable *VecReg = nullptr;
4967     const uint32_t SpreadValue =
4968         (ValValue << 24) | (ValValue << 16) | (ValValue << 8) | ValValue;
4969
4970     auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
4971                                                         uint32_t OffsetAmt) {
4972       assert(Base != nullptr);
4973       Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
4974
4975       // TODO(ascull): is 64-bit better with vector or scalar movq?
4976       auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4977       if (isVectorType(Ty)) {
4978         assert(VecReg != nullptr);
4979         _storep(VecReg, Mem);
4980       } else if (Ty == IceType_f64) {
4981         assert(VecReg != nullptr);
4982         _storeq(VecReg, Mem);
4983       } else {
4984         assert(Ty != IceType_i64);
4985         _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
4986       }
4987     };
4988
4989     // Find the largest type that can be used and use it as much as possible in
4990     // reverse order. Then handle any remainder with overlapping copies. Since
4991     // the remainder will be at the end, there will be reduces pressure on the
4992     // memory unit as the access to the same memory are far apart.
4993     Type Ty;
4994     if (ValValue == 0 && CountValue >= BytesPerStoreq &&
4995         CountValue <= BytesPerStorep * Traits::MEMCPY_UNROLL_LIMIT) {
4996       // When the value is zero it can be loaded into a vector register cheaply
4997       // using the xor trick.
4998       Base = legalizeToReg(Dest);
4999       VecReg = makeVectorOfZeros(IceType_v16i8);
5000       Ty = largestTypeInSize(CountValue);
5001     } else if (CountValue <= BytesPerStorei32 * Traits::MEMCPY_UNROLL_LIMIT) {
5002       // When the value is non-zero or the count is small we can't use vector
5003       // instructions so are limited to 32-bit stores.
5004       Base = legalizeToReg(Dest);
5005       constexpr uint32_t MaxSize = 4;
5006       Ty = largestTypeInSize(CountValue, MaxSize);
5007     }
5008
5009     if (Base) {
5010       uint32_t TyWidth = typeWidthInBytes(Ty);
5011
5012       uint32_t RemainingBytes = CountValue;
5013       uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
5014       while (RemainingBytes >= TyWidth) {
5015         lowerSet(Ty, Offset);
5016         RemainingBytes -= TyWidth;
5017         Offset -= TyWidth;
5018       }
5019
5020       if (RemainingBytes == 0)
5021         return;
5022
5023       // Lower the remaining bytes. Adjust to larger types in order to make use
5024       // of overlaps in the copies.
5025       Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
5026       Offset = CountValue - typeWidthInBytes(LeftOverTy);
5027       lowerSet(LeftOverTy, Offset);
5028       return;
5029     }
5030   }
5031
5032   // Fall back on calling the memset function. The value operand needs to be
5033   // extended to a stack slot size because the PNaCl ABI requires arguments to
5034   // be at least 32 bits wide.
5035   Operand *ValExt;
5036   if (IsValConst) {
5037     ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
5038   } else {
5039     Variable *ValExtVar = Func->makeVariable(stackSlotType());
5040     lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
5041     ValExt = ValExtVar;
5042   }
5043   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
5044   Call->addArg(Dest);
5045   Call->addArg(ValExt);
5046   Call->addArg(Count);
5047   lowerCall(Call);
5048 }
5049
5050 class AddressOptimizer {
5051   AddressOptimizer() = delete;
5052   AddressOptimizer(const AddressOptimizer &) = delete;
5053   AddressOptimizer &operator=(const AddressOptimizer &) = delete;
5054
5055 public:
5056   explicit AddressOptimizer(const Cfg *Func)
5057       : Func(Func), VMetadata(Func->getVMetadata()) {}
5058
5059   inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
5060                              int32_t Offset, const Variable *Base,
5061                              const Variable *Index, uint16_t Shift,
5062                              const Inst *Reason) const;
5063
5064   inline const Inst *matchAssign(Variable **Var,
5065                                  ConstantRelocatable **Relocatable,
5066                                  int32_t *Offset);
5067
5068   inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
5069                                             uint16_t *Shift);
5070
5071   inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
5072
5073   inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
5074                                             const uint16_t Shift,
5075                                             ConstantRelocatable **Relocatable,
5076                                             int32_t *Offset);
5077
5078 private:
5079   const Cfg *const Func;
5080   const VariablesMetadata *const VMetadata;
5081
5082   static bool isAdd(const Inst *Instr) {
5083     if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
5084       return (Arith->getOp() == InstArithmetic::Add);
5085     }
5086     return false;
5087   }
5088 };
5089
5090 void AddressOptimizer::dumpAddressOpt(
5091     const ConstantRelocatable *const Relocatable, int32_t Offset,
5092     const Variable *Base, const Variable *Index, uint16_t Shift,
5093     const Inst *Reason) const {
5094   if (!BuildDefs::dump())
5095     return;
5096   if (!Func->isVerbose(IceV_AddrOpt))
5097     return;
5098   OstreamLocker L(Func->getContext());
5099   Ostream &Str = Func->getContext()->getStrDump();
5100   Str << "Instruction: ";
5101   Reason->dumpDecorated(Func);
5102   Str << "  results in Base=";
5103   if (Base)
5104     Base->dump(Func);
5105   else
5106     Str << "<null>";
5107   Str << ", Index=";
5108   if (Index)
5109     Index->dump(Func);
5110   else
5111     Str << "<null>";
5112   Str << ", Shift=" << Shift << ", Offset=" << Offset
5113       << ", Relocatable=" << Relocatable << "\n";
5114 }
5115
5116 const Inst *AddressOptimizer::matchAssign(Variable **Var,
5117                                           ConstantRelocatable **Relocatable,
5118                                           int32_t *Offset) {
5119   // Var originates from Var=SrcVar ==> set Var:=SrcVar
5120   if (*Var == nullptr)
5121     return nullptr;
5122   if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
5123     assert(!VMetadata->isMultiDef(*Var));
5124     if (llvm::isa<InstAssign>(VarAssign)) {
5125       Operand *SrcOp = VarAssign->getSrc(0);
5126       assert(SrcOp);
5127       if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
5128         if (!VMetadata->isMultiDef(SrcVar) &&
5129             // TODO: ensure SrcVar stays single-BB
5130             true) {
5131           *Var = SrcVar;
5132           return VarAssign;
5133         }
5134       } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
5135         int32_t MoreOffset = Const->getValue();
5136         if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
5137           return nullptr;
5138         *Var = nullptr;
5139         Offset += MoreOffset;
5140         return VarAssign;
5141       } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
5142         if (*Relocatable == nullptr) {
5143           // It is always safe to fold a relocatable through assignment -- the
5144           // assignment frees a slot in the address operand that can be used to
5145           // hold the Sandbox Pointer -- if any.
5146           *Var = nullptr;
5147           *Relocatable = AddReloc;
5148           return VarAssign;
5149         }
5150       }
5151     }
5152   }
5153   return nullptr;
5154 }
5155
5156 const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
5157                                                      Variable **Index,
5158                                                      uint16_t *Shift) {
5159   // Index==nullptr && Base is Base=Var1+Var2 ==>
5160   //   set Base=Var1, Index=Var2, Shift=0
5161   if (*Base == nullptr)
5162     return nullptr;
5163   if (*Index != nullptr)
5164     return nullptr;
5165   auto *BaseInst = VMetadata->getSingleDefinition(*Base);
5166   if (BaseInst == nullptr)
5167     return nullptr;
5168   assert(!VMetadata->isMultiDef(*Base));
5169   if (BaseInst->getSrcSize() < 2)
5170     return nullptr;
5171   if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
5172     if (VMetadata->isMultiDef(Var1))
5173       return nullptr;
5174     if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
5175       if (VMetadata->isMultiDef(Var2))
5176         return nullptr;
5177       if (isAdd(BaseInst) &&
5178           // TODO: ensure Var1 and Var2 stay single-BB
5179           true) {
5180         *Base = Var1;
5181         *Index = Var2;
5182         *Shift = 0; // should already have been 0
5183         return BaseInst;
5184       }
5185     }
5186   }
5187   return nullptr;
5188 }
5189
5190 const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
5191                                                 uint16_t *Shift) {
5192   // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
5193   //   Index=Var, Shift+=log2(Const)
5194   if (*Index == nullptr)
5195     return nullptr;
5196   auto *IndexInst = VMetadata->getSingleDefinition(*Index);
5197   if (IndexInst == nullptr)
5198     return nullptr;
5199   assert(!VMetadata->isMultiDef(*Index));
5200   if (IndexInst->getSrcSize() < 2)
5201     return nullptr;
5202   if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
5203     if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
5204       if (auto *Const =
5205               llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
5206         if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
5207           return nullptr;
5208         switch (ArithInst->getOp()) {
5209         default:
5210           return nullptr;
5211         case InstArithmetic::Mul: {
5212           uint32_t Mult = Const->getValue();
5213           uint32_t LogMult;
5214           switch (Mult) {
5215           case 1:
5216             LogMult = 0;
5217             break;
5218           case 2:
5219             LogMult = 1;
5220             break;
5221           case 4:
5222             LogMult = 2;
5223             break;
5224           case 8:
5225             LogMult = 3;
5226             break;
5227           default:
5228             return nullptr;
5229           }
5230           if (*Shift + LogMult <= 3) {
5231             *Index = Var;
5232             *Shift += LogMult;
5233             return IndexInst;
5234           }
5235         }
5236         case InstArithmetic::Shl: {
5237           uint32_t ShiftAmount = Const->getValue();
5238           switch (ShiftAmount) {
5239           case 0:
5240           case 1:
5241           case 2:
5242           case 3:
5243             break;
5244           default:
5245             return nullptr;
5246           }
5247           if (*Shift + ShiftAmount <= 3) {
5248             *Index = Var;
5249             *Shift += ShiftAmount;
5250             return IndexInst;
5251           }
5252         }
5253         }
5254       }
5255     }
5256   }
5257   return nullptr;
5258 }
5259
5260 const Inst *AddressOptimizer::matchOffsetIndexOrBase(
5261     Variable **IndexOrBase, const uint16_t Shift,
5262     ConstantRelocatable **Relocatable, int32_t *Offset) {
5263   // Base is Base=Var+Const || Base is Base=Const+Var ==>
5264   //   set Base=Var, Offset+=Const
5265   // Base is Base=Var-Const ==>
5266   //   set Base=Var, Offset-=Const
5267   // Index is Index=Var+Const ==>
5268   //   set Index=Var, Offset+=(Const<<Shift)
5269   // Index is Index=Const+Var ==>
5270   //   set Index=Var, Offset+=(Const<<Shift)
5271   // Index is Index=Var-Const ==>
5272   //   set Index=Var, Offset-=(Const<<Shift)
5273   // Treat Index=Var Or Const as Index=Var + Const
5274   //    when Var = Var' << N and log2(Const) <= N
5275   // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
5276
5277   if (*IndexOrBase == nullptr) {
5278     return nullptr;
5279   }
5280   const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
5281   if (Definition == nullptr) {
5282     return nullptr;
5283   }
5284   assert(!VMetadata->isMultiDef(*IndexOrBase));
5285   if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
5286     switch (ArithInst->getOp()) {
5287     case InstArithmetic::Add:
5288     case InstArithmetic::Sub:
5289     case InstArithmetic::Or:
5290       break;
5291     default:
5292       return nullptr;
5293     }
5294
5295     Operand *Src0 = ArithInst->getSrc(0);
5296     Operand *Src1 = ArithInst->getSrc(1);
5297     auto *Var0 = llvm::dyn_cast<Variable>(Src0);
5298     auto *Var1 = llvm::dyn_cast<Variable>(Src1);
5299     auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
5300     auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
5301     auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
5302     auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
5303
5304     bool IsAdd = false;
5305     if (ArithInst->getOp() == InstArithmetic::Or) {
5306       Variable *Var = nullptr;
5307       ConstantInteger32 *Const = nullptr;
5308       if (Var0 && Const1) {
5309         Var = Var0;
5310         Const = Const1;
5311       } else if (Const0 && Var1) {
5312         Var = Var1;
5313         Const = Const0;
5314       } else {
5315         return nullptr;
5316       }
5317       auto *VarDef =
5318           llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var));
5319       if (VarDef == nullptr)
5320         return nullptr;
5321
5322       SizeT ZeroesAvailable = 0;
5323       if (VarDef->getOp() == InstArithmetic::Shl) {
5324         if (auto *ConstInt =
5325                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5326           ZeroesAvailable = ConstInt->getValue();
5327         }
5328       } else if (VarDef->getOp() == InstArithmetic::Mul) {
5329         SizeT PowerOfTwo = 0;
5330         if (auto *MultConst =
5331                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
5332           if (llvm::isPowerOf2_32(MultConst->getValue())) {
5333             PowerOfTwo += MultConst->getValue();
5334           }
5335         }
5336         if (auto *MultConst =
5337                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
5338           if (llvm::isPowerOf2_32(MultConst->getValue())) {
5339             PowerOfTwo += MultConst->getValue();
5340           }
5341         }
5342         ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1;
5343       }
5344       SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
5345       if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
5346         return nullptr;
5347       IsAdd = true; // treat it as an add if the above conditions hold
5348     } else {
5349       IsAdd = ArithInst->getOp() == InstArithmetic::Add;
5350     }
5351
5352     Variable *NewIndexOrBase = nullptr;
5353     int32_t NewOffset = 0;
5354     ConstantRelocatable *NewRelocatable = *Relocatable;
5355     if (Var0 && Var1)
5356       // TODO(sehr): merge base/index splitting into here.
5357       return nullptr;
5358     if (!IsAdd && Var1)
5359       return nullptr;
5360     if (Var0)
5361       NewIndexOrBase = Var0;
5362     else if (Var1)
5363       NewIndexOrBase = Var1;
5364     // Don't know how to add/subtract two relocatables.
5365     if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
5366       return nullptr;
5367     // Don't know how to subtract a relocatable.
5368     if (!IsAdd && Reloc1)
5369       return nullptr;
5370     // Incorporate ConstantRelocatables.
5371     if (Reloc0)
5372       NewRelocatable = Reloc0;
5373     else if (Reloc1)
5374       NewRelocatable = Reloc1;
5375     // Compute the updated constant offset.
5376     if (Const0) {
5377       const int32_t MoreOffset =
5378           IsAdd ? Const0->getValue() : -Const0->getValue();
5379       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5380         return nullptr;
5381       NewOffset += MoreOffset;
5382     }
5383     if (Const1) {
5384       const int32_t MoreOffset =
5385           IsAdd ? Const1->getValue() : -Const1->getValue();
5386       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
5387         return nullptr;
5388       NewOffset += MoreOffset;
5389     }
5390     if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
5391       return nullptr;
5392     *IndexOrBase = NewIndexOrBase;
5393     *Offset += (NewOffset << Shift);
5394     // Shift is always zero if this is called with the base
5395     *Relocatable = NewRelocatable;
5396     return Definition;
5397   }
5398   return nullptr;
5399 }
5400
5401 template <typename TypeTraits>
5402 typename TargetX86Base<TypeTraits>::X86OperandMem *
5403 TargetX86Base<TypeTraits>::computeAddressOpt(const Inst *Instr, Type MemType,
5404                                              Operand *Addr) {
5405   Func->resetCurrentNode();
5406   if (Func->isVerbose(IceV_AddrOpt)) {
5407     OstreamLocker L(Func->getContext());
5408     Ostream &Str = Func->getContext()->getStrDump();
5409     Str << "\nStarting computeAddressOpt for instruction:\n  ";
5410     Instr->dumpDecorated(Func);
5411   }
5412
5413   OptAddr NewAddr;
5414   NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
5415   if (NewAddr.Base == nullptr)
5416     return nullptr;
5417
5418   // If the Base has more than one use or is live across multiple blocks, then
5419   // don't go further. Alternatively (?), never consider a transformation that
5420   // would change a variable that is currently *not* live across basic block
5421   // boundaries into one that *is*.
5422   if (!getFlags().getLoopInvariantCodeMotion()) {
5423     // Need multi block address opt when licm is enabled.
5424     // Might make sense to restrict to current node and loop header.
5425     if (Func->getVMetadata()->isMultiBlock(
5426             NewAddr.Base) /* || Base->getUseCount() > 1*/)
5427       return nullptr;
5428   }
5429   AddressOptimizer AddrOpt(Func);
5430   const bool MockBounds = getFlags().getMockBoundsCheck();
5431   const Inst *Reason = nullptr;
5432   bool AddressWasOptimized = false;
5433   // The following unnamed struct identifies the address mode formation steps
5434   // that could potentially create an invalid memory operand (i.e., no free
5435   // slots for RebasePtr.) We add all those variables to this struct so that we
5436   // can use memset() to reset all members to false.
5437   struct {
5438     bool AssignBase = false;
5439     bool AssignIndex = false;
5440     bool OffsetFromBase = false;
5441     bool OffsetFromIndex = false;
5442     bool CombinedBaseIndex = false;
5443   } Skip;
5444   // This points to the boolean in Skip that represents the last folding
5445   // performed. This is used to disable a pattern match that generated an
5446   // invalid address. Without this, the algorithm would never finish.
5447   bool *SkipLastFolding = nullptr;
5448   // NewAddrCheckpoint is used to rollback the address being formed in case an
5449   // invalid address is formed.
5450   OptAddr NewAddrCheckpoint;
5451   Reason = Instr;
5452   do {
5453     if (SandboxingType != ST_None) {
5454       // When sandboxing, we defer the sandboxing of NewAddr to the Concrete
5455       // Target. If our optimization was overly aggressive, then we simply undo
5456       // what the previous iteration did, and set the previous pattern's skip
5457       // bit to true.
5458       if (!legalizeOptAddrForSandbox(&NewAddr)) {
5459         *SkipLastFolding = true;
5460         SkipLastFolding = nullptr;
5461         NewAddr = NewAddrCheckpoint;
5462         Reason = nullptr;
5463       }
5464     }
5465
5466     if (Reason) {
5467       AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
5468                              NewAddr.Index, NewAddr.Shift, Reason);
5469       AddressWasOptimized = true;
5470       Reason = nullptr;
5471       SkipLastFolding = nullptr;
5472       memset(&Skip, 0, sizeof(Skip));
5473     }
5474
5475     NewAddrCheckpoint = NewAddr;
5476
5477     // Update Base and Index to follow through assignments to definitions.
5478     if (!Skip.AssignBase &&
5479         (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
5480                                       &NewAddr.Offset))) {
5481       SkipLastFolding = &Skip.AssignBase;
5482       // Assignments of Base from a Relocatable or ConstantInt32 can result
5483       // in Base becoming nullptr.  To avoid code duplication in this loop we
5484       // prefer that Base be non-nullptr if possible.
5485       if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
5486           NewAddr.Shift == 0) {
5487         std::swap(NewAddr.Base, NewAddr.Index);
5488       }
5489       continue;
5490     }
5491     if (!Skip.AssignBase &&
5492         (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
5493                                       &NewAddr.Offset))) {
5494       SkipLastFolding = &Skip.AssignIndex;
5495       continue;
5496     }
5497
5498     if (!MockBounds) {
5499       // Transition from:
5500       //   <Relocatable + Offset>(Base) to
5501       //   <Relocatable + Offset>(Base, Index)
5502       if (!Skip.CombinedBaseIndex &&
5503           (Reason = AddrOpt.matchCombinedBaseIndex(
5504                &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
5505         SkipLastFolding = &Skip.CombinedBaseIndex;
5506         continue;
5507       }
5508
5509       // Recognize multiply/shift and update Shift amount.
5510       // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
5511       //   Index=Var, Shift+=Const
5512       // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
5513       //   Index=Var, Shift+=log2(Const)
5514       if ((Reason =
5515                AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
5516         continue;
5517       }
5518
5519       // If Shift is zero, the choice of Base and Index was purely arbitrary.
5520       // Recognize multiply/shift and set Shift amount.
5521       // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
5522       //   swap(Index,Base)
5523       // Similar for Base=Const*Var and Base=Var<<Const
5524       if (NewAddr.Shift == 0 &&
5525           (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
5526         std::swap(NewAddr.Base, NewAddr.Index);
5527         continue;
5528       }
5529     }
5530
5531     // Update Offset to reflect additions/subtractions with constants and
5532     // relocatables.
5533     // TODO: consider overflow issues with respect to Offset.
5534     if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
5535                                      &NewAddr.Base, /*Shift =*/0,
5536                                      &NewAddr.Relocatable, &NewAddr.Offset))) {
5537       SkipLastFolding = &Skip.OffsetFromBase;
5538       continue;
5539     }
5540     if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
5541                                       &NewAddr.Index, NewAddr.Shift,
5542                                       &NewAddr.Relocatable, &NewAddr.Offset))) {
5543       SkipLastFolding = &Skip.OffsetFromIndex;
5544       continue;
5545     }
5546
5547     break;
5548   } while (Reason);
5549
5550   if (!AddressWasOptimized) {
5551     return nullptr;
5552   }
5553
5554   // Undo any addition of RebasePtr.  It will be added back when the mem
5555   // operand is sandboxed.
5556   if (NewAddr.Base == RebasePtr) {
5557     NewAddr.Base = nullptr;
5558   }
5559
5560   if (NewAddr.Index == RebasePtr) {
5561     NewAddr.Index = nullptr;
5562     NewAddr.Shift = 0;
5563   }
5564
5565   Constant *OffsetOp = nullptr;
5566   if (NewAddr.Relocatable == nullptr) {
5567     OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
5568   } else {
5569     OffsetOp =
5570         Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
5571                             NewAddr.Relocatable->getName());
5572   }
5573   // Vanilla ICE load instructions should not use the segment registers, and
5574   // computeAddressOpt only works at the level of Variables and Constants, not
5575   // other X86OperandMem, so there should be no mention of segment
5576   // registers there either.
5577   static constexpr auto SegmentReg =
5578       X86OperandMem::SegmentRegisters::DefaultSegment;
5579
5580   return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
5581                                NewAddr.Index, NewAddr.Shift, SegmentReg);
5582 }
5583
5584 /// Add a mock bounds check on the memory address before using it as a load or
5585 /// store operand.  The basic idea is that given a memory operand [reg], we
5586 /// would first add bounds-check code something like:
5587 ///
5588 ///   cmp reg, <lb>
5589 ///   jl out_of_line_error
5590 ///   cmp reg, <ub>
5591 ///   jg out_of_line_error
5592 ///
5593 /// In reality, the specific code will depend on how <lb> and <ub> are
5594 /// represented, e.g. an immediate, a global, or a function argument.
5595 ///
5596 /// As such, we need to enforce that the memory operand does not have the form
5597 /// [reg1+reg2], because then there is no simple cmp instruction that would
5598 /// suffice.  However, we consider [reg+offset] to be OK because the offset is
5599 /// usually small, and so <ub> could have a safety buffer built in and then we
5600 /// could instead branch to a custom out_of_line_error that does the precise
5601 /// check and jumps back if it turns out OK.
5602 ///
5603 /// For the purpose of mocking the bounds check, we'll do something like this:
5604 ///
5605 ///   cmp reg, 0
5606 ///   je label
5607 ///   cmp reg, 1
5608 ///   je label
5609 ///   label:
5610 ///
5611 /// Also note that we don't need to add a bounds check to a dereference of a
5612 /// simple global variable address.
5613 template <typename TraitsType>
5614 void TargetX86Base<TraitsType>::doMockBoundsCheck(Operand *Opnd) {
5615   if (!getFlags().getMockBoundsCheck())
5616     return;
5617   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
5618     if (Mem->getIndex()) {
5619       llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
5620     }
5621     Opnd = Mem->getBase();
5622   }
5623   // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps
5624   // something else.  We only care if it is Variable.
5625   auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
5626   if (Var == nullptr)
5627     return;
5628   // We use lowerStore() to copy out-args onto the stack.  This creates a memory
5629   // operand with the stack pointer as the base register.  Don't do bounds
5630   // checks on that.
5631   if (Var->getRegNum() == getStackReg())
5632     return;
5633
5634   auto *Label = InstX86Label::create(Func, this);
5635   _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
5636   _br(Traits::Cond::Br_e, Label);
5637   _cmp(Opnd, Ctx->getConstantInt32(1));
5638   _br(Traits::Cond::Br_e, Label);
5639   Context.insert(Label);
5640 }
5641
5642 template <typename TraitsType>
5643 void TargetX86Base<TraitsType>::lowerLoad(const InstLoad *Load) {
5644   // A Load instruction can be treated the same as an Assign instruction, after
5645   // the source operand is transformed into an X86OperandMem operand.  Note that
5646   // the address mode optimization already creates an X86OperandMem operand, so
5647   // it doesn't need another level of transformation.
5648   Variable *DestLoad = Load->getDest();
5649   Type Ty = DestLoad->getType();
5650   Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
5651   doMockBoundsCheck(Src0);
5652   auto *Assign = InstAssign::create(Func, DestLoad, Src0);
5653   lowerAssign(Assign);
5654 }
5655
5656 template <typename TraitsType>
5657 void TargetX86Base<TraitsType>::doAddressOptLoad() {
5658   Inst *Instr = Context.getCur();
5659   Operand *Addr = Instr->getSrc(0);
5660   Variable *Dest = Instr->getDest();
5661   if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
5662     Instr->setDeleted();
5663     Context.insert<InstLoad>(Dest, OptAddr);
5664   }
5665 }
5666
5667 template <typename TraitsType>
5668 void TargetX86Base<TraitsType>::randomlyInsertNop(float Probability,
5669                                                   RandomNumberGenerator &RNG) {
5670   RandomNumberGeneratorWrapper RNGW(RNG);
5671   if (RNGW.getTrueWithProbability(Probability)) {
5672     _nop(RNGW(Traits::X86_NUM_NOP_VARIANTS));
5673   }
5674 }
5675
5676 template <typename TraitsType>
5677 void TargetX86Base<TraitsType>::lowerPhi(const InstPhi * /*Instr*/) {
5678   Func->setError("Phi found in regular instruction list");
5679 }
5680
5681 template <typename TraitsType>
5682 void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) {
5683   Variable *Reg = nullptr;
5684   if (Instr->hasRetValue()) {
5685     Operand *RetValue = legalize(Instr->getRetValue());
5686     const Type ReturnType = RetValue->getType();
5687     assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
5688            (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
5689     Reg = moveReturnValueToRegister(RetValue, ReturnType);
5690   }
5691   // Add a ret instruction even if sandboxing is enabled, because addEpilog
5692   // explicitly looks for a ret instruction as a marker for where to insert the
5693   // frame removal instructions.
5694   _ret(Reg);
5695   // Add a fake use of esp to make sure esp stays alive for the entire
5696   // function. Otherwise post-call esp adjustments get dead-code eliminated.
5697   keepEspLiveAtExit();
5698 }
5699
5700 inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
5701                                SizeT Index3) {
5702   const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
5703                      ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
5704   assert(Mask < 256);
5705   return Mask;
5706 }
5707
5708 template <typename TraitsType>
5709 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_AllFromSameSrc(
5710     Variable *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
5711   constexpr SizeT SrcBit = 1 << 2;
5712   assert((Index0 & SrcBit) == (Index1 & SrcBit));
5713   assert((Index0 & SrcBit) == (Index2 & SrcBit));
5714   assert((Index0 & SrcBit) == (Index3 & SrcBit));
5715   (void)SrcBit;
5716
5717   const Type SrcTy = Src->getType();
5718   auto *T = makeReg(SrcTy);
5719   auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
5720   auto *Mask =
5721       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
5722   _pshufd(T, SrcRM, Mask);
5723   return T;
5724 }
5725
5726 template <typename TraitsType>
5727 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_TwoFromSameSrc(
5728     Variable *Src0, SizeT Index0, SizeT Index1, Variable *Src1, SizeT Index2,
5729     SizeT Index3) {
5730   constexpr SizeT SrcBit = 1 << 2;
5731   assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
5732   assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
5733   (void)SrcBit;
5734
5735   const Type SrcTy = Src0->getType();
5736   assert(Src1->getType() == SrcTy);
5737   auto *T = makeReg(SrcTy);
5738   auto *Src0R = legalizeToReg(Src0);
5739   auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5740   auto *Mask =
5741       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
5742   _movp(T, Src0R);
5743   _shufps(T, Src1RM, Mask);
5744   return T;
5745 }
5746
5747 template <typename TraitsType>
5748 Variable *TargetX86Base<TraitsType>::lowerShuffleVector_UnifyFromDifferentSrcs(
5749     Variable *Src0, SizeT Index0, Variable *Src1, SizeT Index1) {
5750   return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
5751                                            Index1, IGNORE_INDEX);
5752 }
5753
5754 inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
5755                                SizeT Index3) {
5756   constexpr SizeT SrcBit = 1 << 2;
5757   const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
5758   const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
5759   const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
5760   const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
5761   return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
5762 }
5763
5764 template <typename TraitsType>
5765 GlobalString TargetX86Base<TraitsType>::lowerShuffleVector_NewMaskName() {
5766   GlobalString FuncName = Func->getFunctionName();
5767   const SizeT Id = PshufbMaskCount++;
5768   if (!BuildDefs::dump() || !FuncName.hasStdString()) {
5769     return GlobalString::createWithString(
5770         Ctx,
5771         "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
5772   }
5773   return GlobalString::createWithString(
5774       Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
5775 }
5776
5777 template <typename TraitsType>
5778 ConstantRelocatable *
5779 TargetX86Base<TraitsType>::lowerShuffleVector_CreatePshufbMask(
5780     int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
5781     int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
5782     int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
5783     int8_t Idx15) {
5784   static constexpr uint8_t NumElements = 16;
5785   const char Initializer[NumElements] = {
5786       Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
5787       Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
5788   };
5789
5790   static constexpr Type V4VectorType = IceType_v4i32;
5791   const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
5792   auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
5793   GlobalString MaskName = lowerShuffleVector_NewMaskName();
5794   Mask->setIsConstant(true);
5795   Mask->addInitializer(VariableDeclaration::DataInitializer::create(
5796       Func->getGlobalPool(), Initializer, NumElements));
5797   Mask->setName(MaskName);
5798   // Mask needs to be 16-byte aligned, or pshufb will seg fault.
5799   Mask->setAlignment(MaskAlignment);
5800   Func->addGlobal(Mask);
5801
5802   constexpr RelocOffsetT Offset = 0;
5803   return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
5804 }
5805
5806 template <typename TraitsType>
5807 void TargetX86Base<TraitsType>::lowerShuffleVector_UsingPshufb(
5808     Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
5809     int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
5810     int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
5811     int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
5812   const Type DestTy = Dest->getType();
5813   static constexpr bool NotRebased = false;
5814   static constexpr Variable *NoBase = nullptr;
5815   // We use void for the memory operand instead of DestTy because using the
5816   // latter causes a validation failure: the X86 Inst layer complains that
5817   // vector mem operands could be under aligned. Thus, using void we avoid the
5818   // validation error. Note that the mask global declaration is aligned, so it
5819   // can be used as an XMM mem operand.
5820   static constexpr Type MaskType = IceType_void;
5821 #define IDX_IN_SRC(N, S)                                                       \
5822   ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
5823   auto *Mask0M = X86OperandMem::create(
5824       Func, MaskType, NoBase,
5825       lowerShuffleVector_CreatePshufbMask(
5826           IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
5827           IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
5828           IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
5829           IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
5830           IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
5831           IDX_IN_SRC(Idx15, 0)),
5832       NotRebased);
5833   auto *Mask1M = X86OperandMem::create(
5834       Func, MaskType, NoBase,
5835       lowerShuffleVector_CreatePshufbMask(
5836           IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
5837           IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
5838           IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
5839           IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
5840           IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
5841           IDX_IN_SRC(Idx15, 1)),
5842       NotRebased);
5843 #undef IDX_IN_SRC
5844   auto *T0 = makeReg(DestTy);
5845   auto *T1 = makeReg(DestTy);
5846   auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5847   _movp(T0, Src0RM);
5848   auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5849   _movp(T1, Src1RM);
5850
5851   _pshufb(T1, Mask1M);
5852   _pshufb(T0, Mask0M);
5853   _por(T1, T0);
5854   _movp(Dest, T1);
5855 }
5856
5857 template <typename TraitsType>
5858 void TargetX86Base<TraitsType>::lowerShuffleVector(
5859     const InstShuffleVector *Instr) {
5860   auto *Dest = Instr->getDest();
5861   const Type DestTy = Dest->getType();
5862   auto *Src0 = llvm::cast<Variable>(Instr->getSrc(0));
5863   auto *Src1 = llvm::cast<Variable>(Instr->getSrc(1));
5864   const SizeT NumElements = typeNumElements(DestTy);
5865
5866   auto *T = makeReg(DestTy);
5867
5868   switch (DestTy) {
5869   default:
5870     llvm::report_fatal_error("Unexpected vector type.");
5871   case IceType_v16i1:
5872   case IceType_v16i8: {
5873     if (InstructionSet < Traits::SSE4_1) {
5874       // TODO(jpp): figure out how to lower with sse2.
5875       break;
5876     }
5877     static constexpr SizeT ExpectedNumElements = 16;
5878     assert(ExpectedNumElements == Instr->getNumIndexes());
5879     (void)ExpectedNumElements;
5880     const SizeT Index0 = Instr->getIndex(0)->getValue();
5881     const SizeT Index1 = Instr->getIndex(1)->getValue();
5882     const SizeT Index2 = Instr->getIndex(2)->getValue();
5883     const SizeT Index3 = Instr->getIndex(3)->getValue();
5884     const SizeT Index4 = Instr->getIndex(4)->getValue();
5885     const SizeT Index5 = Instr->getIndex(5)->getValue();
5886     const SizeT Index6 = Instr->getIndex(6)->getValue();
5887     const SizeT Index7 = Instr->getIndex(7)->getValue();
5888     const SizeT Index8 = Instr->getIndex(8)->getValue();
5889     const SizeT Index9 = Instr->getIndex(9)->getValue();
5890     const SizeT Index10 = Instr->getIndex(10)->getValue();
5891     const SizeT Index11 = Instr->getIndex(11)->getValue();
5892     const SizeT Index12 = Instr->getIndex(12)->getValue();
5893     const SizeT Index13 = Instr->getIndex(13)->getValue();
5894     const SizeT Index14 = Instr->getIndex(14)->getValue();
5895     const SizeT Index15 = Instr->getIndex(15)->getValue();
5896     lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
5897                                    Index3, Index4, Index5, Index6, Index7,
5898                                    Index8, Index9, Index10, Index11, Index12,
5899                                    Index13, Index14, Index15);
5900     return;
5901   }
5902   case IceType_v8i1:
5903   case IceType_v8i16: {
5904     if (InstructionSet < Traits::SSE4_1) {
5905       // TODO(jpp): figure out how to lower with sse2.
5906       break;
5907     }
5908     static constexpr SizeT ExpectedNumElements = 8;
5909     assert(ExpectedNumElements == Instr->getNumIndexes());
5910     (void)ExpectedNumElements;
5911     const SizeT Index0 = Instr->getIndex(0)->getValue();
5912     const SizeT Index1 = Instr->getIndex(1)->getValue();
5913     const SizeT Index2 = Instr->getIndex(2)->getValue();
5914     const SizeT Index3 = Instr->getIndex(3)->getValue();
5915     const SizeT Index4 = Instr->getIndex(4)->getValue();
5916     const SizeT Index5 = Instr->getIndex(5)->getValue();
5917     const SizeT Index6 = Instr->getIndex(6)->getValue();
5918     const SizeT Index7 = Instr->getIndex(7)->getValue();
5919 #define TO_BYTE_INDEX(I) ((I) << 1)
5920     lowerShuffleVector_UsingPshufb(
5921         Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
5922         TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
5923         TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
5924         TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
5925         TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
5926         TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
5927         TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
5928         TO_BYTE_INDEX(Index7) + 1);
5929 #undef TO_BYTE_INDEX
5930     return;
5931   }
5932   case IceType_v4i1:
5933   case IceType_v4i32:
5934   case IceType_v4f32: {
5935     static constexpr SizeT ExpectedNumElements = 4;
5936     assert(ExpectedNumElements == Instr->getNumIndexes());
5937     const SizeT Index0 = Instr->getIndex(0)->getValue();
5938     const SizeT Index1 = Instr->getIndex(1)->getValue();
5939     const SizeT Index2 = Instr->getIndex(2)->getValue();
5940     const SizeT Index3 = Instr->getIndex(3)->getValue();
5941     Variable *T = nullptr;
5942     switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
5943 #define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
5944   case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
5945       CASE_SRCS_IN(0, 0, 0, 0) : {
5946         T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
5947                                               Index3);
5948       }
5949       break;
5950       CASE_SRCS_IN(0, 0, 0, 1) : {
5951         assert(false && "Following code is untested but likely correct; test "
5952                         "and remove assert.");
5953         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
5954                                                                   Src1, Index3);
5955         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
5956                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5957       }
5958       break;
5959       CASE_SRCS_IN(0, 0, 1, 0) : {
5960         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
5961                                                                   Src0, Index3);
5962         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
5963                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5964       }
5965       break;
5966       CASE_SRCS_IN(0, 0, 1, 1) : {
5967         assert(false && "Following code is untested but likely correct; test "
5968                         "and remove assert.");
5969         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
5970                                               Index2, Index3);
5971       }
5972       break;
5973       CASE_SRCS_IN(0, 1, 0, 0) : {
5974         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
5975                                                                   Src1, Index1);
5976         T = lowerShuffleVector_TwoFromSameSrc(
5977             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
5978       }
5979       break;
5980       CASE_SRCS_IN(0, 1, 0, 1) : {
5981         if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
5982             (Index3 - ExpectedNumElements) == 1) {
5983           assert(false && "Following code is untested but likely correct; test "
5984                           "and remove assert.");
5985           auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5986           auto *Src0R = legalizeToReg(Src0);
5987           T = makeReg(DestTy);
5988           _movp(T, Src0R);
5989           _punpckl(T, Src1RM);
5990         } else if (Index0 == Index2 && Index1 == Index3) {
5991           assert(false && "Following code is untested but likely correct; test "
5992                           "and remove assert.");
5993           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
5994               Src0, Index0, Src1, Index1);
5995           T = lowerShuffleVector_AllFromSameSrc(
5996               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
5997               UNIFIED_INDEX_1);
5998         } else {
5999           assert(false && "Following code is untested but likely correct; test "
6000                           "and remove assert.");
6001           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6002               Src0, Index0, Src1, Index1);
6003           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6004               Src0, Index2, Src1, Index3);
6005           T = lowerShuffleVector_TwoFromSameSrc(
6006               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6007               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6008         }
6009       }
6010       break;
6011       CASE_SRCS_IN(0, 1, 1, 0) : {
6012         if (Index0 == Index3 && Index1 == Index2) {
6013           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6014               Src0, Index0, Src1, Index1);
6015           T = lowerShuffleVector_AllFromSameSrc(
6016               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6017               UNIFIED_INDEX_0);
6018         } else {
6019           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6020               Src0, Index0, Src1, Index1);
6021           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6022               Src1, Index2, Src0, Index3);
6023           T = lowerShuffleVector_TwoFromSameSrc(
6024               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6025               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6026         }
6027       }
6028       break;
6029       CASE_SRCS_IN(0, 1, 1, 1) : {
6030         assert(false && "Following code is untested but likely correct; test "
6031                         "and remove assert.");
6032         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
6033                                                                   Src1, Index1);
6034         T = lowerShuffleVector_TwoFromSameSrc(
6035             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6036       }
6037       break;
6038       CASE_SRCS_IN(1, 0, 0, 0) : {
6039         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6040                                                                   Src0, Index1);
6041         T = lowerShuffleVector_TwoFromSameSrc(
6042             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
6043       }
6044       break;
6045       CASE_SRCS_IN(1, 0, 0, 1) : {
6046         if (Index0 == Index3 && Index1 == Index2) {
6047           assert(false && "Following code is untested but likely correct; test "
6048                           "and remove assert.");
6049           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6050               Src1, Index0, Src0, Index1);
6051           T = lowerShuffleVector_AllFromSameSrc(
6052               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
6053               UNIFIED_INDEX_0);
6054         } else {
6055           assert(false && "Following code is untested but likely correct; test "
6056                           "and remove assert.");
6057           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6058               Src1, Index0, Src0, Index1);
6059           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6060               Src0, Index2, Src1, Index3);
6061           T = lowerShuffleVector_TwoFromSameSrc(
6062               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6063               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6064         }
6065       }
6066       break;
6067       CASE_SRCS_IN(1, 0, 1, 0) : {
6068         if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
6069             (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
6070           auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
6071           auto *Src0R = legalizeToReg(Src1);
6072           T = makeReg(DestTy);
6073           _movp(T, Src0R);
6074           _punpckl(T, Src1RM);
6075         } else if (Index0 == Index2 && Index1 == Index3) {
6076           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
6077               Src1, Index0, Src0, Index1);
6078           T = lowerShuffleVector_AllFromSameSrc(
6079               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
6080               UNIFIED_INDEX_1);
6081         } else {
6082           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
6083               Src1, Index0, Src0, Index1);
6084           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
6085               Src1, Index2, Src0, Index3);
6086           T = lowerShuffleVector_TwoFromSameSrc(
6087               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
6088               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6089         }
6090       }
6091       break;
6092       CASE_SRCS_IN(1, 0, 1, 1) : {
6093         assert(false && "Following code is untested but likely correct; test "
6094                         "and remove assert.");
6095         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
6096                                                                   Src0, Index1);
6097         T = lowerShuffleVector_TwoFromSameSrc(
6098             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
6099       }
6100       break;
6101       CASE_SRCS_IN(1, 1, 0, 0) : {
6102         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
6103                                               Index2, Index3);
6104       }
6105       break;
6106       CASE_SRCS_IN(1, 1, 0, 1) : {
6107         assert(false && "Following code is untested but likely correct; test "
6108                         "and remove assert.");
6109         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
6110                                                                   Src1, Index3);
6111         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6112                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6113       }
6114       break;
6115       CASE_SRCS_IN(1, 1, 1, 0) : {
6116         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
6117                                                                   Src0, Index3);
6118         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
6119                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
6120       }
6121       break;
6122       CASE_SRCS_IN(1, 1, 1, 1) : {
6123         assert(false && "Following code is untested but likely correct; test "
6124                         "and remove assert.");
6125         T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
6126                                               Index3);
6127       }
6128       break;
6129 #undef CASE_SRCS_IN
6130     }
6131
6132     assert(T != nullptr);
6133     assert(T->getType() == DestTy);
6134     _movp(Dest, T);
6135     return;
6136   } break;
6137   }
6138
6139   // Unoptimized shuffle. Perform a series of inserts and extracts.
6140   Context.insert<InstFakeDef>(T);
6141   const Type ElementType = typeElementType(DestTy);
6142   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
6143     auto *Index = Instr->getIndex(I);
6144     const SizeT Elem = Index->getValue();
6145     auto *ExtElmt = makeReg(ElementType);
6146     if (Elem < NumElements) {
6147       lowerExtractElement(
6148           InstExtractElement::create(Func, ExtElmt, Src0, Index));
6149     } else {
6150       lowerExtractElement(InstExtractElement::create(
6151           Func, ExtElmt, Src1,
6152           Ctx->getConstantInt32(Index->getValue() - NumElements)));
6153     }
6154     auto *NewT = makeReg(DestTy);
6155     lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
6156                                                  Ctx->getConstantInt32(I)));
6157     T = NewT;
6158   }
6159   _movp(Dest, T);
6160 }
6161
6162 template <typename TraitsType>
6163 void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) {
6164   Variable *Dest = Select->getDest();
6165
6166   if (isVectorType(Dest->getType())) {
6167     lowerSelectVector(Select);
6168     return;
6169   }
6170
6171   Operand *Condition = Select->getCondition();
6172   // Handle folding opportunities.
6173   if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
6174     assert(Producer->isDeleted());
6175     switch (BoolFolding<Traits>::getProducerKind(Producer)) {
6176     default:
6177       break;
6178     case BoolFolding<Traits>::PK_Icmp32:
6179     case BoolFolding<Traits>::PK_Icmp64: {
6180       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
6181       return;
6182     }
6183     case BoolFolding<Traits>::PK_Fcmp: {
6184       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
6185       return;
6186     }
6187     }
6188   }
6189
6190   Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
6191   Operand *Zero = Ctx->getConstantZero(IceType_i32);
6192   _cmp(CmpResult, Zero);
6193   Operand *SrcT = Select->getTrueOperand();
6194   Operand *SrcF = Select->getFalseOperand();
6195   const BrCond Cond = Traits::Cond::Br_ne;
6196   lowerSelectMove(Dest, Cond, SrcT, SrcF);
6197 }
6198
/// Emits the move sequence that assigns SrcT to Dest when Cond holds and SrcF
/// otherwise. No compare is emitted here; the caller is expected to have set
/// the condition flags already (lowerSelect() emits the cmp before calling).
///
/// 1-byte and floating-point destinations use an explicit branch around a
/// second mov; i64 on 32-bit targets is split into lo/hi halves, each lowered
/// with lowerSelectIntMove(); the remaining integer types use
/// lowerSelectIntMove() directly.
///
/// \param Dest destination variable of the select.
/// \param Cond branch condition under which SrcT is chosen.
/// \param SrcT operand selected when Cond is true.
/// \param SrcF operand selected when Cond is false.
6199 template <typename TraitsType>
6200 void TargetX86Base<TraitsType>::lowerSelectMove(Variable *Dest, BrCond Cond,
6201                                                 Operand *SrcT, Operand *SrcF) {
6202   Type DestTy = Dest->getType();
6203   if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
6204     // The cmov instruction doesn't allow 8-bit or FP operands, so we need
6205     // explicit control flow.
6206     // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
6207     auto *Label = InstX86Label::create(Func, this);
6208     SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
6209     _mov(Dest, SrcT);
6210     _br(Cond, Label);
6211     SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
6212     _redefined(_mov(Dest, SrcF));
6213     Context.insert(Label);
6214     return;
6215   }
6216   // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
6217   // But if SrcT is immediate, we might be able to do better, as the cmov
6218   // instruction doesn't allow an immediate operand:
6219   // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
6220   if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
6221     std::swap(SrcT, SrcF);
6222     Cond = InstImpl<TraitsType>::InstX86Base::getOppositeCondition(Cond);
6223   }
6224   if (!Traits::Is64Bit && DestTy == IceType_i64) {
6225     SrcT = legalizeUndef(SrcT);
6226     SrcF = legalizeUndef(SrcF);
6227     // Set the low portion.
6228     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6229     lowerSelectIntMove(DestLo, Cond, loOperand(SrcT), loOperand(SrcF));
6230     // Set the high portion.
6231     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6232     lowerSelectIntMove(DestHi, Cond, hiOperand(SrcT), hiOperand(SrcF));
6233     return;
6234   }
6235
6236   assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
6237          (Traits::Is64Bit && DestTy == IceType_i64));
6238   lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
6239 }
6240
/// Emits the cmov-based integer select: mov T, SrcF; cmov<Cond> T, SrcT;
/// mov Dest, T. SrcT is legalized to a register or memory operand before being
/// passed to _cmov(). As in lowerSelectMove(), no compare is emitted here.
6241 template <typename TraitsType>
6242 void TargetX86Base<TraitsType>::lowerSelectIntMove(Variable *Dest, BrCond Cond,
6243                                                    Operand *SrcT,
6244                                                    Operand *SrcF) {
6245   Variable *T = nullptr;
6246   SrcF = legalize(SrcF);
6247   _mov(T, SrcF);
6248   SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
6249   _cmov(T, SrcT, Cond);
6250   _mov(Dest, T);
6251 }
6252
/// Lowers a move of Src into Dest; both must have the same type and Dest must
/// not be rematerializable (asserted).
///
/// On 32-bit targets an i64 move is split into separate lo and hi moves, each
/// going through its own temporary. Other types are legalized and moved with
/// _movp (vectors) or _mov (scalars).
///
/// \param Dest destination variable.
/// \param Src source operand.
/// \param IsRedefinition forwarded to _redefined() for every move into Dest
///        (or its halves).
6253 template <typename TraitsType>
6254 void TargetX86Base<TraitsType>::lowerMove(Variable *Dest, Operand *Src,
6255                                           bool IsRedefinition) {
6256   assert(Dest->getType() == Src->getType());
6257   assert(!Dest->isRematerializable());
6258   if (!Traits::Is64Bit && Dest->getType() == IceType_i64) {
6259     Src = legalize(Src);
6260     Operand *SrcLo = loOperand(Src);
6261     Operand *SrcHi = hiOperand(Src);
6262     auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
6263     auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
6264     Variable *T_Lo = nullptr, *T_Hi = nullptr;
6265     _mov(T_Lo, SrcLo);
6266     _redefined(_mov(DestLo, T_Lo), IsRedefinition);
6267     _mov(T_Hi, SrcHi);
6268     _redefined(_mov(DestHi, T_Hi), IsRedefinition);
6269   } else {
6270     Operand *SrcLegal;
6271     if (Dest->hasReg()) {
6272       // If Dest already has a physical register, then only basic legalization
6273       // is needed, as the source operand can be a register, immediate, or
6274       // memory.
           // NOTE(review): the call below requests Legal_Reg only, targeting
           // Dest's register; confirm this matches the comment above.
6275       SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
6276     } else {
6277       // If Dest could be a stack operand, then RI must be a physical register
6278       // or a scalar integer immediate.
6279       SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
6280     }
6281     if (isVectorType(Dest->getType())) {
6282       _redefined(_movp(Dest, SrcLegal), IsRedefinition);
6283     } else {
6284       _redefined(_mov(Dest, SrcLegal), IsRedefinition);
6285     }
6286   }
6287 }
6288
/// Placeholder for pattern-matching an fcmp feeding a select (e.g. min/max
/// idioms). No pattern is implemented yet: every path returns false, so
/// nothing is ever lowered here.
///
/// \return false in all cases at present.
6289 template <typename TraitsType>
6290 bool TargetX86Base<TraitsType>::lowerOptimizeFcmpSelect(
6291     const InstFcmp *Fcmp, const InstSelect *Select) {
6292   Operand *CmpSrc0 = Fcmp->getSrc(0);
6293   Operand *CmpSrc1 = Fcmp->getSrc(1);
6294   Operand *SelectSrcT = Select->getTrueOperand();
6295   Operand *SelectSrcF = Select->getFalseOperand();
6296
6297   if (CmpSrc0->getType() != SelectSrcT->getType())
6298     return false;
6299
6300   // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
6301   InstFcmp::FCond Condition = Fcmp->getCondition();
6302   switch (Condition) {
6303   default:
6304     return false;
6305   case InstFcmp::True:
6306   case InstFcmp::False:
6307   case InstFcmp::Ogt:
6308   case InstFcmp::Olt:
         // The casts to void only silence unused-variable warnings until real
         // patterns are added.
6309     (void)CmpSrc0;
6310     (void)CmpSrc1;
6311     (void)SelectSrcT;
6312     (void)SelectSrcF;
6313     break;
6314   }
6315   return false;
6316 }
6317
/// Lowers an icmp instruction. Vector results go to lowerIcmpVector(); scalar
/// results go to lowerIcmpAndConsumer() with a null consumer, i.e. the compare
/// is lowered on its own rather than folded into a following instruction.
6318 template <typename TraitsType>
6319 void TargetX86Base<TraitsType>::lowerIcmp(const InstIcmp *Icmp) {
6320   Variable *Dest = Icmp->getDest();
6321   if (isVectorType(Dest->getType())) {
6322     lowerIcmpVector(Icmp);
6323   } else {
6324     constexpr Inst *Consumer = nullptr;
6325     lowerIcmpAndConsumer(Icmp, Consumer);
6326   }
6327 }
6328
/// Lowers a select whose destination is a vector; any other destination type
/// is a fatal error.
///
/// With SSE4.1 or later the select is a single blend with the mask pinned in
/// xmm0: blendvps for 4-element types and pblendvb for 8- and 16-element
/// types. Without SSE4.1 the result is built from a full-width mask as
/// (SrcT & mask) | (SrcF & ~mask).
6329 template <typename TraitsType>
6330 void TargetX86Base<TraitsType>::lowerSelectVector(const InstSelect *Instr) {
6331   Variable *Dest = Instr->getDest();
6332   Type DestTy = Dest->getType();
6333   Operand *SrcT = Instr->getTrueOperand();
6334   Operand *SrcF = Instr->getFalseOperand();
6335   Operand *Condition = Instr->getCondition();
6336
6337   if (!isVectorType(DestTy))
6338     llvm::report_fatal_error("Expected a vector select");
6339
6340   Type SrcTy = SrcT->getType();
6341   Variable *T = makeReg(SrcTy);
6342   Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
6343   Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
6344   if (InstructionSet >= Traits::SSE4_1) {
6345     // TODO(wala): If the condition operand is a constant, use blendps or
6346     // pblendw.
6347     //
6348     // Use blendvps or pblendvb to implement select.
6349     if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
6350         SrcTy == IceType_v4f32) {
6351       Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
6352       Variable *xmm0 = makeReg(IceType_v4i32, Traits::RegisterSet::Reg_xmm0);
6353       _movp(xmm0, ConditionRM);
           // Shift each lane left by 31 so that bit 0 of the condition lands in
           // the lane's top bit, which is the bit blendvps selects on.
6354       _psll(xmm0, Ctx->getConstantInt8(31));
6355       _movp(T, SrcFRM);
6356       _blendvps(T, SrcTRM, xmm0);
6357       _movp(Dest, T);
6358     } else {
6359       assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
6360       Type SignExtTy =
6361           Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
6362       Variable *xmm0 = makeReg(SignExtTy, Traits::RegisterSet::Reg_xmm0);
           // Sign-extend the condition directly into xmm0 to form the byte
           // blend mask.
6363       lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
6364       _movp(T, SrcFRM);
6365       _pblendvb(T, SrcTRM, xmm0);
6366       _movp(Dest, T);
6367     }
6368     return;
6369   }
6370   // Lower select without Traits::SSE4.1:
6371   // a=d?b:c ==>
6372   //   if elementtype(d) != i1:
6373   //      d=sext(d);
6374   //   a=(b&d)|(c&~d);
6375   Variable *T2 = makeReg(SrcTy);
6376   // Sign extend the condition operand if applicable.
6377   if (SrcTy == IceType_v4f32) {
6378     // The sext operation takes only integer arguments.
6379     Variable *T3 = Func->makeVariable(IceType_v4i32);
6380     lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
6381     _movp(T, T3);
6382   } else if (typeElementType(SrcTy) != IceType_i1) {
6383     lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
6384   } else {
6385     Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
6386     _movp(T, ConditionRM);
6387   }
       // T holds the mask; T2 is a second copy so pand and pandn can each
       // consume one.
6388   _movp(T2, T);
6389   _pand(T, SrcTRM);
6390   _pandn(T2, SrcFRM);
6391   _por(T, T2);
6392   _movp(Dest, T);
6393
6394   return;
6395 }
6396
/// Lowers a store instruction. The address is turned into an X86OperandMem
/// typed after the stored value and passed to doMockBoundsCheck().
///
/// On 32-bit targets an i64 store becomes two stores, high half first and then
/// low half. Vector values are stored from a register with _storep; all other
/// values are stored with _store from a register or immediate.
6397 template <typename TraitsType>
6398 void TargetX86Base<TraitsType>::lowerStore(const InstStore *Instr) {
6399   Operand *Value = Instr->getData();
6400   Operand *Addr = Instr->getAddr();
6401   X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
6402   doMockBoundsCheck(NewAddr);
6403   Type Ty = NewAddr->getType();
6404
6405   if (!Traits::Is64Bit && Ty == IceType_i64) {
6406     Value = legalizeUndef(Value);
6407     Operand *ValueHi = legalize(hiOperand(Value), Legal_Reg | Legal_Imm);
6408     _store(ValueHi, llvm::cast<X86OperandMem>(hiOperand(NewAddr)));
6409     Operand *ValueLo = legalize(loOperand(Value), Legal_Reg | Legal_Imm);
6410     _store(ValueLo, llvm::cast<X86OperandMem>(loOperand(NewAddr)));
6411   } else if (isVectorType(Ty)) {
6412     _storep(legalizeToReg(Value), NewAddr);
6413   } else {
6414     Value = legalize(Value, Legal_Reg | Legal_Imm);
6415     _store(Value, NewAddr);
6416   }
6417 }
6418
/// Runs address-mode optimization on the store at the current Context
/// position. If computeAddressOpt() yields an optimized address, the original
/// store is marked deleted and a new InstStore using that address is inserted.
/// When the original store has a dest, its RMW beacon is copied to the new
/// store.
6419 template <typename TraitsType>
6420 void TargetX86Base<TraitsType>::doAddressOptStore() {
6421   auto *Instr = llvm::cast<InstStore>(Context.getCur());
6422   Operand *Addr = Instr->getAddr();
6423   Operand *Data = Instr->getData();
6424   if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
6425     Instr->setDeleted();
6426     auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
6427     if (Instr->getDest())
6428       NewStore->setRmwBeacon(Instr->getRmwBeacon());
6429   }
6430 }
6431
/// Emits the compare for a range test of Comparison against [Min, Max].
///
/// When Min is non-zero, Comparison is first copied into a temporary and Min
/// is subtracted from it; the (possibly adjusted) value is then compared with
/// Max - Min. The caller emits the branch that consumes the flags; within this
/// file the callers use unsigned conditions (Br_a / Br_be).
///
/// \param Comparison value being range-tested.
/// \param Min low end of the range.
/// \param Max high end of the range.
/// \return the operand that was compared: the original Comparison when Min is
///         zero, otherwise the temporary holding Comparison - Min.
6432 template <typename TraitsType>
6433 Operand *TargetX86Base<TraitsType>::lowerCmpRange(Operand *Comparison,
6434                                                   uint64_t Min, uint64_t Max) {
6435   // TODO(ascull): 64-bit should not reach here but only because it is not
6436   // implemented yet. This should be able to handle the 64-bit case.
6437   assert(Traits::Is64Bit || Comparison->getType() != IceType_i64);
6438   // Subtracting 0 is a nop so don't do it
6439   if (Min != 0) {
6440     // Avoid clobbering the comparison by copying it
6441     Variable *T = nullptr;
6442     _mov(T, Comparison);
6443     _sub(T, Ctx->getConstantInt32(Min));
6444     Comparison = T;
6445   }
6446
6447   _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
6448
6449   return Comparison;
6450 }
6451
/// Lowers one case cluster of a switch.
///
/// JumpTable clusters: range-check the value, branch out if it is above the
/// range (to DefaultTarget, or to a local skip label when there is none), then
/// load the target address from the jump table and jump to it indirectly.
///
/// Range clusters: a unit range is a single equality test, a two-element range
/// whose low value was already compared is two equality tests, and anything
/// else is a lowerCmpRange() test followed by a "below or equal" branch. An
/// unconditional branch to DefaultTarget follows when it is non-null.
///
/// \param Case the cluster to lower.
/// \param Comparison the value being switched on.
/// \param DoneCmp true if flags from comparing Comparison with the cluster's
///        low value are already available (used by the Range kind only).
/// \param DefaultTarget node to branch to when the cluster does not match, or
///        nullptr to fall through.
6452 template <typename TraitsType>
6453 void TargetX86Base<TraitsType>::lowerCaseCluster(const CaseCluster &Case,
6454                                                  Operand *Comparison,
6455                                                  bool DoneCmp,
6456                                                  CfgNode *DefaultTarget) {
6457   switch (Case.getKind()) {
6458   case CaseCluster::JumpTable: {
         // Only assigned and used when DefaultTarget is null.
6459     InstX86Label *SkipJumpTable;
6460
6461     Operand *RangeIndex =
6462         lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
6463     if (DefaultTarget == nullptr) {
6464       // Skip over jump table logic if comparison not in range and no default
6465       SkipJumpTable = InstX86Label::create(Func, this);
6466       _br(Traits::Cond::Br_a, SkipJumpTable);
6467     } else {
6468       _br(Traits::Cond::Br_a, DefaultTarget);
6469     }
6470
6471     InstJumpTable *JumpTable = Case.getJumpTable();
6472     Context.insert(JumpTable);
6473
6474     // Make sure the index is a register of the same width as the base
6475     Variable *Index;
6476     const Type PointerType = getPointerType();
6477     if (RangeIndex->getType() != PointerType) {
6478       Index = makeReg(PointerType);
6479       if (RangeIndex->getType() == IceType_i64) {
6480         assert(Traits::Is64Bit);
6481         _mov(Index, RangeIndex); // trunc
6482       } else {
6483         _movzx(Index, RangeIndex);
6484       }
6485     } else {
6486       Index = legalizeToReg(RangeIndex);
6487     }
6488
6489     constexpr RelocOffsetT RelocOffset = 0;
6490     constexpr Variable *NoBase = nullptr;
6491     auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
6492     Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
6493     uint16_t Shift = typeWidthInBytesLog2(PointerType);
6494     constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
6495
6496     Variable *Target = nullptr;
6497     if (Traits::Is64Bit && NeedSandboxing) {
6498       assert(Index != nullptr && Index->getType() == IceType_i32);
6499     }
         // Address of the table entry: jump-table symbol + (Index << Shift),
         // with no base register.
6500     auto *TargetInMemory = X86OperandMem::create(Func, PointerType, NoBase,
6501                                                  Offset, Index, Shift, Segment);
6502     _mov(Target, TargetInMemory);
6503
6504     lowerIndirectJump(Target);
6505
6506     if (DefaultTarget == nullptr)
6507       Context.insert(SkipJumpTable);
6508     return;
6509   }
6510   case CaseCluster::Range: {
6511     if (Case.isUnitRange()) {
6512       // Single item
6513       if (!DoneCmp) {
6514         Constant *Value = Ctx->getConstantInt32(Case.getLow());
6515         _cmp(Comparison, Value);
6516       }
6517       _br(Traits::Cond::Br_e, Case.getTarget());
6518     } else if (DoneCmp && Case.isPairRange()) {
6519       // Range of two items with first item already compared against
6520       _br(Traits::Cond::Br_e, Case.getTarget());
6521       Constant *Value = Ctx->getConstantInt32(Case.getHigh());
6522       _cmp(Comparison, Value);
6523       _br(Traits::Cond::Br_e, Case.getTarget());
6524     } else {
6525       // Range
6526       lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
6527       _br(Traits::Cond::Br_be, Case.getTarget());
6528     }
6529     if (DefaultTarget != nullptr)
6530       _br(DefaultTarget);
6531     return;
6532   }
6533   }
6534 }
6535
/// Lowers a switch instruction.
///
/// The cases are grouped into clusters by CaseCluster::clusterizeSwitch() and
/// searched with a binary search driven by an explicit stack of spans.
///
/// On 32-bit targets an i64 switch value is handled up front: if the highest
/// case value exceeds UINT32_MAX, every case is tested with a lo/hi compare
/// pair and the function returns; otherwise the high word is checked against
/// zero (branching to the default when non-zero) and lowering continues on the
/// low word only.
6536 template <typename TraitsType>
6537 void TargetX86Base<TraitsType>::lowerSwitch(const InstSwitch *Instr) {
6538   // Group cases together and navigate through them with a binary search
6539   CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
6540   Operand *Src0 = Instr->getComparison();
6541   CfgNode *DefaultTarget = Instr->getLabelDefault();
6542
6543   assert(CaseClusters.size() != 0); // Should always be at least one
6544
6545   if (!Traits::Is64Bit && Src0->getType() == IceType_i64) {
6546     Src0 = legalize(Src0); // get Base/Index into physical registers
6547     Operand *Src0Lo = loOperand(Src0);
6548     Operand *Src0Hi = hiOperand(Src0);
6549     if (CaseClusters.back().getHigh() > UINT32_MAX) {
6550       // TODO(ascull): handle 64-bit case properly (currently naive version)
6551       // This might be handled by a higher level lowering of switches.
6552       SizeT NumCases = Instr->getNumCases();
6553       if (NumCases >= 2) {
6554         Src0Lo = legalizeToReg(Src0Lo);
6555         Src0Hi = legalizeToReg(Src0Hi);
6556       } else {
6557         Src0Lo = legalize(Src0Lo, Legal_Reg | Legal_Mem);
6558         Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
6559       }
6560       for (SizeT I = 0; I < NumCases; ++I) {
6561         Constant *ValueLo = Ctx->getConstantInt32(Instr->getValue(I));
6562         Constant *ValueHi = Ctx->getConstantInt32(Instr->getValue(I) >> 32);
6563         InstX86Label *Label = InstX86Label::create(Func, this);
             // Low words differ: skip to the next case. Otherwise the high
             // words decide.
6564         _cmp(Src0Lo, ValueLo);
6565         _br(Traits::Cond::Br_ne, Label);
6566         _cmp(Src0Hi, ValueHi);
6567         _br(Traits::Cond::Br_e, Instr->getLabel(I));
6568         Context.insert(Label);
6569       }
6570       _br(Instr->getLabelDefault());
6571       return;
6572     } else {
6573       // All the values are 32-bit so just check the operand is too and then
6574       // fall through to the 32-bit implementation. This is a common case.
6575       Src0Hi = legalize(Src0Hi, Legal_Reg | Legal_Mem);
6576       Constant *Zero = Ctx->getConstantInt32(0);
6577       _cmp(Src0Hi, Zero);
6578       _br(Traits::Cond::Br_ne, DefaultTarget);
6579       Src0 = Src0Lo;
6580     }
6581   }
6582
6583   // 32-bit lowering
6584
6585   if (CaseClusters.size() == 1) {
6586     // Jump straight to default if needed. Currently a common case as jump
6587     // tables occur on their own.
6588     constexpr bool DoneCmp = false;
6589     lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
6590     return;
6591   }
6592
6593   // Going to be using multiple times so get it in a register early
6594   Variable *Comparison = legalizeToReg(Src0);
6595
6596   // A span is over the clusters
6597   struct SearchSpan {
6598     SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
6599         : Begin(Begin), Size(Size), Label(Label) {}
6600
6601     SizeT Begin;
6602     SizeT Size;
6603     InstX86Label *Label;
6604   };
6605   // The stack will only grow to the height of the tree so 12 should be plenty
6606   std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
6607   SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
       // DoneCmp records whether the flags still hold a compare against the
       // low value of the cluster lowered next.
6608   bool DoneCmp = false;
6609
6610   while (!SearchSpanStack.empty()) {
6611     SearchSpan Span = SearchSpanStack.top();
6612     SearchSpanStack.pop();
6613
6614     if (Span.Label != nullptr)
6615       Context.insert(Span.Label);
6616
6617     switch (Span.Size) {
6618     case 0:
6619       llvm::report_fatal_error("Invalid SearchSpan size");
6620       break;
6621
6622     case 1:
           // The last span processed omits the default branch here; the
           // trailing _br(DefaultTarget) after the loop covers it.
6623       lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
6624                        SearchSpanStack.empty() ? nullptr : DefaultTarget);
6625       DoneCmp = false;
6626       break;
6627
6628     case 2: {
6629       const CaseCluster *CaseA = &CaseClusters[Span.Begin];
6630       const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
6631
6632       // Placing a range last may allow register clobbering during the range
6633       // test. That means there is no need to clone the register. If it is a
6634       // unit range the comparison may have already been done in the binary
6635       // search (DoneCmp) and so it should be placed first. If this is a range
6636       // of two items and the comparison with the low value has already been
6637       // done, comparing with the other element is cheaper than a range test.
6638       // If the low end of the range is zero then there is no subtraction and
6639       // nothing to be gained.
6640       if (!CaseA->isUnitRange() &&
6641           !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
6642         std::swap(CaseA, CaseB);
6643         DoneCmp = false;
6644       }
6645
6646       lowerCaseCluster(*CaseA, Comparison, DoneCmp);
6647       DoneCmp = false;
6648       lowerCaseCluster(*CaseB, Comparison, DoneCmp,
6649                        SearchSpanStack.empty() ? nullptr : DefaultTarget);
6650     } break;
6651
6652     default:
6653       // Pick the middle item and branch b or ae
6654       SizeT PivotIndex = Span.Begin + (Span.Size / 2);
6655       const CaseCluster &Pivot = CaseClusters[PivotIndex];
6656       Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
6657       InstX86Label *Label = InstX86Label::create(Func, this);
6658       _cmp(Comparison, Value);
6659       // TODO(ascull): does it always have to be far?
6660       _br(Traits::Cond::Br_b, Label, InstX86Br::Far);
6661       // Lower the left and (pivot+right) sides, falling through to the right
6662       SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
6663       SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
6664       DoneCmp = true;
6665       break;
6666     }
6667   }
6668
6669   _br(DefaultTarget);
6670 }
6671
6672 /// The following pattern occurs often in lowered C and C++ code:
6673 ///
6674 ///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
6675 ///   %cmp.ext = sext <n x i1> %cmp to <n x ty>
6676 ///
6677 /// We can eliminate the sext operation by copying the result of pcmpeqd,
6678 /// pcmpgtd, or cmpps (which produce sign extended results) to the result of the
6679 /// sext operation.
     ///
     /// If the next instruction in the Context is a Sext cast whose source is
     /// SignExtendedResult, that cast is marked deleted, its dest receives a
     /// register copy of SignExtendedResult, and the Context is advanced past it.
     /// Otherwise nothing is emitted.
     ///
     /// \param SignExtendedResult the already sign-extended compare result.
6680 template <typename TraitsType>
6681 void TargetX86Base<TraitsType>::eliminateNextVectorSextInstruction(
6682     Variable *SignExtendedResult) {
6683   if (auto *NextCast =
6684           llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
6685     if (NextCast->getCastKind() == InstCast::Sext &&
6686         NextCast->getSrc(0) == SignExtendedResult) {
6687       NextCast->setDeleted();
6688       _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
6689       // Skip over the instruction.
6690       Context.advanceNext();
6691     }
6692   }
6693 }
6694
/// Lowers an unreachable instruction to ud2, then calls keepEspLiveAtExit().
/// The instruction argument itself is unused.
6695 template <typename TraitsType>
6696 void TargetX86Base<TraitsType>::lowerUnreachable(
6697     const InstUnreachable * /*Instr*/) {
6698   _ud2();
6699   // Add a fake use of esp to make sure esp adjustments after the unreachable
6700   // do not get dead-code eliminated.
6701   keepEspLiveAtExit();
6702 }
6703
/// Lowers a breakpoint instruction to a single int3. The instruction argument
/// itself is unused.
6704 template <typename TraitsType>
6705 void TargetX86Base<TraitsType>::lowerBreakpoint(
6706     const InstBreakpoint * /*Instr*/) {
6707   _int3();
6708 }
6709
/// Lowers a fake read-modify-write instruction into an x86 arithmetic
/// instruction with a memory destination.
///
/// Lowering is abandoned (nothing emitted) unless this instruction is the last
/// use of the RMW beacon. Add, Sub, And, Or and Xor are supported; on 32-bit
/// targets an i64 operation is split into a lo/hi instruction pair, using
/// adc/sbb on the high half for Add/Sub. Any other operator falls out of the
/// switch and triggers a fatal error.
6710 template <typename TraitsType>
6711 void TargetX86Base<TraitsType>::lowerRMW(const InstX86FakeRMW *RMW) {
6712   // If the beacon variable's live range does not end in this instruction, then
6713   // it must end in the modified Store instruction that follows. This means
6714   // that the original Store instruction is still there, either because the
6715   // value being stored is used beyond the Store instruction, or because dead
6716   // code elimination did not happen. In either case, we cancel RMW lowering
6717   // (and the caller deletes the RMW instruction).
6718   if (!RMW->isLastUse(RMW->getBeacon()))
6719     return;
6720   Operand *Src = RMW->getData();
6721   Type Ty = Src->getType();
6722   X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
6723   doMockBoundsCheck(Addr);
6724   if (!Traits::Is64Bit && Ty == IceType_i64) {
6725     Src = legalizeUndef(Src);
6726     Operand *SrcLo = legalize(loOperand(Src), Legal_Reg | Legal_Imm);
6727     Operand *SrcHi = legalize(hiOperand(Src), Legal_Reg | Legal_Imm);
6728     auto *AddrLo = llvm::cast<X86OperandMem>(loOperand(Addr));
6729     auto *AddrHi = llvm::cast<X86OperandMem>(hiOperand(Addr));
6730     switch (RMW->getOp()) {
6731     default:
6732       // TODO(stichnot): Implement other arithmetic operators.
6733       break;
6734     case InstArithmetic::Add:
6735       _add_rmw(AddrLo, SrcLo);
6736       _adc_rmw(AddrHi, SrcHi);
6737       return;
6738     case InstArithmetic::Sub:
6739       _sub_rmw(AddrLo, SrcLo);
6740       _sbb_rmw(AddrHi, SrcHi);
6741       return;
6742     case InstArithmetic::And:
6743       _and_rmw(AddrLo, SrcLo);
6744       _and_rmw(AddrHi, SrcHi);
6745       return;
6746     case InstArithmetic::Or:
6747       _or_rmw(AddrLo, SrcLo);
6748       _or_rmw(AddrHi, SrcHi);
6749       return;
6750     case InstArithmetic::Xor:
6751       _xor_rmw(AddrLo, SrcLo);
6752       _xor_rmw(AddrHi, SrcHi);
6753       return;
6754     }
6755   } else {
6756     // x86-32: i8, i16, i32
6757     // x86-64: i8, i16, i32, i64
6758     switch (RMW->getOp()) {
6759     default:
6760       // TODO(stichnot): Implement other arithmetic operators.
6761       break;
6762     case InstArithmetic::Add:
6763       Src = legalize(Src, Legal_Reg | Legal_Imm);
6764       _add_rmw(Addr, Src);
6765       return;
6766     case InstArithmetic::Sub:
6767       Src = legalize(Src, Legal_Reg | Legal_Imm);
6768       _sub_rmw(Addr, Src);
6769       return;
6770     case InstArithmetic::And:
6771       Src = legalize(Src, Legal_Reg | Legal_Imm);
6772       _and_rmw(Addr, Src);
6773       return;
6774     case InstArithmetic::Or:
6775       Src = legalize(Src, Legal_Reg | Legal_Imm);
6776       _or_rmw(Addr, Src);
6777       return;
6778     case InstArithmetic::Xor:
6779       Src = legalize(Src, Legal_Reg | Legal_Imm);
6780       _xor_rmw(Addr, Src);
6781       return;
6782     }
6783   }
       // Reached only when the operator was not handled by either switch above.
6784   llvm::report_fatal_error("Couldn't lower RMW instruction");
6785 }
6786
/// Lowers target-specific instructions: an InstX86FakeRMW is handled by
/// lowerRMW(); everything else is delegated to TargetLowering::lowerOther().
6787 template <typename TraitsType>
6788 void TargetX86Base<TraitsType>::lowerOther(const Inst *Instr) {
6789   if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
6790     lowerRMW(RMW);
6791   } else {
6792     TargetLowering::lowerOther(Instr);
6793   }
6794 }
6795
6796 /// Turn an i64 Phi instruction into a pair of i32 Phi instructions, to preserve
6797 /// integrity of liveness analysis. Undef values are also turned into zeroes,
6798 /// since loOperand() and hiOperand() don't expect Undef input.  Also, in
6799 /// Non-SFI mode, add a FakeUse(RebasePtr) for every pooled constant operand.
6800 template <typename TraitsType> void TargetX86Base<TraitsType>::prelowerPhis() {
6801   if (getFlags().getUseNonsfi()) {
6802     assert(RebasePtr);
6803     CfgNode *Node = Context.getNode();
6804     uint32_t RebasePtrUseCount = 0;
6805     for (Inst &I : Node->getPhis()) {
6806       auto *Phi = llvm::dyn_cast<InstPhi>(&I);
6807       if (Phi->isDeleted())
6808         continue;
6809       for (SizeT I = 0; I < Phi->getSrcSize(); ++I) {
6810         Operand *Src = Phi->getSrc(I);
6811         // TODO(stichnot): This over-counts for +0.0, and under-counts for other
6812         // kinds of pooling.
6813         if (llvm::isa<ConstantRelocatable>(Src) ||
6814             llvm::isa<ConstantFloat>(Src) || llvm::isa<ConstantDouble>(Src)) {
6815           ++RebasePtrUseCount;
6816         }
6817       }
6818     }
6819     if (RebasePtrUseCount) {
6820       Node->getInsts().push_front(InstFakeUse::create(Func, RebasePtr));
6821     }
6822   }
6823   if (Traits::Is64Bit) {
6824     // On x86-64 we don't need to prelower phis -- the architecture can handle
6825     // 64-bit integer natively.
6826     return;
6827   }
6828
6829   // Pause constant blinding or pooling, blinding or pooling will be done later
6830   // during phi lowering assignments
6831   BoolFlagSaver B(RandomizationPoolingPaused, true);
6832   PhiLowering::prelowerPhis32Bit<TargetX86Base<TraitsType>>(
6833       this, Context.getNode(), Func);
6834 }
6835
6836 template <typename TraitsType>
6837 void TargetX86Base<TraitsType>::genTargetHelperCallFor(Inst *Instr) {
6838   uint32_t StackArgumentsSize = 0;
6839   if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
6840     RuntimeHelper HelperID = RuntimeHelper::H_Num;
6841     Variable *Dest = Arith->getDest();
6842     Type DestTy = Dest->getType();
6843     if (!Traits::Is64Bit && DestTy == IceType_i64) {
6844       switch (Arith->getOp()) {
6845       default:
6846         return;
6847       case InstArithmetic::Udiv:
6848         HelperID = RuntimeHelper::H_udiv_i64;
6849         break;
6850       case InstArithmetic::Sdiv:
6851         HelperID = RuntimeHelper::H_sdiv_i64;
6852         break;
6853       case InstArithmetic::Urem:
6854         HelperID = RuntimeHelper::H_urem_i64;
6855         break;
6856       case InstArithmetic::Srem:
6857         HelperID = RuntimeHelper::H_srem_i64;
6858         break;
6859       }
6860     } else if (isVectorType(DestTy)) {
6861       Variable *Dest = Arith->getDest();
6862       Operand *Src0 = Arith->getSrc(0);
6863       Operand *Src1 = Arith->getSrc(1);
6864       switch (Arith->getOp()) {
6865       default:
6866         return;
6867       case InstArithmetic::Mul:
6868         if (DestTy == IceType_v16i8) {
6869           scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
6870           Arith->setDeleted();
6871         }
6872         return;
6873       case InstArithmetic::Shl:
6874       case InstArithmetic::Lshr:
6875       case InstArithmetic::Ashr:
6876       case InstArithmetic::Udiv:
6877       case InstArithmetic::Urem:
6878       case InstArithmetic::Sdiv:
6879       case InstArithmetic::Srem:
6880       case InstArithmetic::Frem:
6881         scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
6882         Arith->setDeleted();
6883         return;
6884       }
6885     } else {
6886       switch (Arith->getOp()) {
6887       default:
6888         return;
6889       case InstArithmetic::Frem:
6890         if (isFloat32Asserting32Or64(DestTy))
6891           HelperID = RuntimeHelper::H_frem_f32;
6892         else
6893           HelperID = RuntimeHelper::H_frem_f64;
6894       }
6895     }
6896     constexpr SizeT MaxSrcs = 2;
6897     InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
6898     Call->addArg(Arith->getSrc(0));
6899     Call->addArg(Arith->getSrc(1));
6900     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
6901     Context.insert(Call);
6902     Arith->setDeleted();
6903   } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
6904     InstCast::OpKind CastKind = Cast->getCastKind();
6905     Operand *Src0 = Cast->getSrc(0);
6906     const Type SrcType = Src0->getType();
6907     Variable *Dest = Cast->getDest();
6908     const Type DestTy = Dest->getType();
6909     RuntimeHelper HelperID = RuntimeHelper::H_Num;
6910     Variable *CallDest = Dest;
6911     switch (CastKind) {
6912     default:
6913       return;
6914     case InstCast::Fptosi:
6915       if (!Traits::Is64Bit && DestTy == IceType_i64) {
6916         HelperID = isFloat32Asserting32Or64(SrcType)
6917                        ? RuntimeHelper::H_fptosi_f32_i64
6918                        : RuntimeHelper::H_fptosi_f64_i64;
6919       } else {
6920         return;
6921       }
6922       break;
6923     case InstCast::Fptoui:
6924       if (isVectorType(DestTy)) {
6925         assert(DestTy == IceType_v4i32 && SrcType == IceType_v4f32);
6926         HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
6927       } else if (DestTy == IceType_i64 ||
6928                  (!Traits::Is64Bit && DestTy == IceType_i32)) {
6929         if (Traits::Is64Bit) {
6930           HelperID = isFloat32Asserting32Or64(SrcType)
6931                          ? RuntimeHelper::H_fptoui_f32_i64
6932                          : RuntimeHelper::H_fptoui_f64_i64;
6933         } else if (isInt32Asserting32Or64(DestTy)) {
6934           HelperID = isFloat32Asserting32Or64(SrcType)
6935                          ? RuntimeHelper::H_fptoui_f32_i32
6936                          : RuntimeHelper::H_fptoui_f64_i32;
6937         } else {
6938           HelperID = isFloat32Asserting32Or64(SrcType)
6939                          ? RuntimeHelper::H_fptoui_f32_i64
6940                          : RuntimeHelper::H_fptoui_f64_i64;
6941         }
6942       } else {
6943         return;
6944       }
6945       break;
6946     case InstCast::Sitofp:
6947       if (!Traits::Is64Bit && SrcType == IceType_i64) {
6948         HelperID = isFloat32Asserting32Or64(DestTy)
6949                        ? RuntimeHelper::H_sitofp_i64_f32
6950                        : RuntimeHelper::H_sitofp_i64_f64;
6951       } else {
6952         return;
6953       }
6954       break;
6955     case InstCast::Uitofp:
6956       if (isVectorType(SrcType)) {
6957         assert(DestTy == IceType_v4f32 && SrcType == IceType_v4i32);
6958         HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
6959       } else if (SrcType == IceType_i64 ||
6960                  (!Traits::Is64Bit && SrcType == IceType_i32)) {
6961         if (isInt32Asserting32Or64(SrcType)) {
6962           HelperID = isFloat32Asserting32Or64(DestTy)
6963                          ? RuntimeHelper::H_uitofp_i32_f32
6964                          : RuntimeHelper::H_uitofp_i32_f64;
6965         } else {
6966           HelperID = isFloat32Asserting32Or64(DestTy)
6967                          ? RuntimeHelper::H_uitofp_i64_f32
6968                          : RuntimeHelper::H_uitofp_i64_f64;
6969         }
6970       } else {
6971         return;
6972       }
6973       break;
6974     case InstCast::Bitcast: {
6975       if (DestTy == Src0->getType())
6976         return;
6977       switch (DestTy) {
6978       default:
6979         return;
6980       case IceType_i8:
6981         assert(Src0->getType() == IceType_v8i1);
6982         HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
6983         CallDest = Func->makeVariable(IceType_i32);
6984         break;
6985       case IceType_i16:
6986         assert(Src0->getType() == IceType_v16i1);
6987         HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
6988         CallDest = Func->makeVariable(IceType_i32);
6989         break;
6990       case IceType_v8i1: {
6991         assert(Src0->getType() == IceType_i8);
6992         HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
6993         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
6994         // Arguments to functions are required to be at least 32 bits wide.
6995         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
6996         Src0 = Src0AsI32;
6997       } break;
6998       case IceType_v16i1: {
6999         assert(Src0->getType() == IceType_i16);
7000         HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
7001         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
7002         // Arguments to functions are required to be at least 32 bits wide.
7003         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
7004         Src0 = Src0AsI32;
7005       } break;
7006       }
7007     } break;
7008     }
7009     constexpr SizeT MaxSrcs = 1;
7010     InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
7011     Call->addArg(Src0);
7012     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
7013     Context.insert(Call);
7014     // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call
7015     // result to the appropriate type as necessary.
7016     if (CallDest->getType() != Dest->getType())
7017       Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
7018     Cast->setDeleted();
7019   } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsicCall>(Instr)) {
7020     CfgVector<Type> ArgTypes;
7021     Type ReturnType = IceType_void;
7022     switch (Intrinsics::IntrinsicID ID = Intrinsic->getIntrinsicInfo().ID) {
7023     default:
7024       return;
7025     case Intrinsics::Ctpop: {
7026       Operand *Val = Intrinsic->getArg(0);
7027       Type ValTy = Val->getType();
7028       if (ValTy == IceType_i64)
7029         ArgTypes = {IceType_i64};
7030       else
7031         ArgTypes = {IceType_i32};
7032       ReturnType = IceType_i32;
7033     } break;
7034     case Intrinsics::Longjmp:
7035       ArgTypes = {IceType_i32, IceType_i32};
7036       ReturnType = IceType_void;
7037       break;
7038     case Intrinsics::Memcpy:
7039       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7040       ReturnType = IceType_void;
7041       break;
7042     case Intrinsics::Memmove:
7043       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7044       ReturnType = IceType_void;
7045       break;
7046     case Intrinsics::Memset:
7047       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
7048       ReturnType = IceType_void;
7049       break;
7050     case Intrinsics::NaClReadTP:
7051       ReturnType = IceType_i32;
7052       break;
7053     case Intrinsics::Setjmp:
7054       ArgTypes = {IceType_i32};
7055       ReturnType = IceType_i32;
7056       break;
7057     }
7058     StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
7059   } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
7060     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
7061   } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
7062     if (!Ret->hasRetValue())
7063       return;
7064     Operand *RetValue = Ret->getRetValue();
7065     Type ReturnType = RetValue->getType();
7066     if (!isScalarFloatingType(ReturnType))
7067       return;
7068     StackArgumentsSize = typeWidthInBytes(ReturnType);
7069   } else {
7070     return;
7071   }
7072   StackArgumentsSize = Traits::applyStackAlignment(StackArgumentsSize);
7073   updateMaxOutArgsSizeBytes(StackArgumentsSize);
7074 }
7075
7076 template <typename TraitsType>
7077 uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
7078     const CfgVector<Type> &ArgTypes, Type ReturnType) {
7079   uint32_t OutArgumentsSizeBytes = 0;
7080   uint32_t XmmArgCount = 0;
7081   uint32_t GprArgCount = 0;
7082   for (Type Ty : ArgTypes) {
7083     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
7084     assert(typeWidthInBytes(Ty) >= 4);
7085     if (isVectorType(Ty) && XmmArgCount < Traits::X86_MAX_XMM_ARGS) {
7086       ++XmmArgCount;
7087     } else if (isScalarFloatingType(Ty) && Traits::X86_PASS_SCALAR_FP_IN_XMM &&
7088                XmmArgCount < Traits::X86_MAX_XMM_ARGS) {
7089       ++XmmArgCount;
7090     } else if (isScalarIntegerType(Ty) &&
7091                GprArgCount < Traits::X86_MAX_GPR_ARGS) {
7092       // The 64 bit ABI allows some integers to be passed in GPRs.
7093       ++GprArgCount;
7094     } else {
7095       if (isVectorType(Ty)) {
7096         OutArgumentsSizeBytes =
7097             Traits::applyStackAlignment(OutArgumentsSizeBytes);
7098       }
7099       OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
7100     }
7101   }
7102   if (Traits::Is64Bit)
7103     return OutArgumentsSizeBytes;
7104   // The 32 bit ABI requires floating point values to be returned on the x87 FP
7105   // stack. Ensure there is enough space for the fstp/movs for floating returns.
7106   if (isScalarFloatingType(ReturnType)) {
7107     OutArgumentsSizeBytes =
7108         std::max(OutArgumentsSizeBytes,
7109                  static_cast<uint32_t>(typeWidthInBytesOnStack(ReturnType)));
7110   }
7111   return OutArgumentsSizeBytes;
7112 }
7113
7114 template <typename TraitsType>
7115 uint32_t TargetX86Base<TraitsType>::getCallStackArgumentsSizeBytes(
7116     const InstCall *Instr) {
7117   // Build a vector of the arguments' types.
7118   const SizeT NumArgs = Instr->getNumArgs();
7119   CfgVector<Type> ArgTypes;
7120   ArgTypes.reserve(NumArgs);
7121   for (SizeT i = 0; i < NumArgs; ++i) {
7122     Operand *Arg = Instr->getArg(i);
7123     ArgTypes.emplace_back(Arg->getType());
7124   }
7125   // Compute the return type (if any);
7126   Type ReturnType = IceType_void;
7127   Variable *Dest = Instr->getDest();
7128   if (Dest != nullptr)
7129     ReturnType = Dest->getType();
7130   return getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
7131 }
7132
7133 template <typename TraitsType>
7134 Variable *TargetX86Base<TraitsType>::makeZeroedRegister(Type Ty,
7135                                                         RegNumT RegNum) {
7136   Variable *Reg = makeReg(Ty, RegNum);
7137   switch (Ty) {
7138   case IceType_i1:
7139   case IceType_i8:
7140   case IceType_i16:
7141   case IceType_i32:
7142   case IceType_i64:
7143     // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
7144     _mov(Reg, Ctx->getConstantZero(Ty));
7145     break;
7146   case IceType_f32:
7147   case IceType_f64:
7148     Context.insert<InstFakeDef>(Reg);
7149     _xorps(Reg, Reg);
7150     break;
7151   default:
7152     // All vector types use the same pxor instruction.
7153     assert(isVectorType(Ty));
7154     Context.insert<InstFakeDef>(Reg);
7155     _pxor(Reg, Reg);
7156     break;
7157   }
7158   return Reg;
7159 }
7160
7161 // There is no support for loading or emitting vector constants, so the vector
7162 // values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
7163 // initialized with register operations.
7164 //
7165 // TODO(wala): Add limited support for vector constants so that complex
7166 // initialization in registers is unnecessary.
7167
7168 template <typename TraitsType>
7169 Variable *TargetX86Base<TraitsType>::makeVectorOfZeros(Type Ty,
7170                                                        RegNumT RegNum) {
7171   return makeZeroedRegister(Ty, RegNum);
7172 }
7173
7174 template <typename TraitsType>
7175 Variable *TargetX86Base<TraitsType>::makeVectorOfMinusOnes(Type Ty,
7176                                                            RegNumT RegNum) {
7177   Variable *MinusOnes = makeReg(Ty, RegNum);
7178   // Insert a FakeDef so the live range of MinusOnes is not overestimated.
7179   Context.insert<InstFakeDef>(MinusOnes);
7180   if (Ty == IceType_f64)
7181     // Making a vector of minus ones of type f64 is currently only used for the
7182     // fabs intrinsic.  To use the f64 type to create this mask with pcmpeqq
7183     // requires SSE 4.1.  Since we're just creating a mask, pcmpeqd does the
7184     // same job and only requires SSE2.
7185     _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
7186   else
7187     _pcmpeq(MinusOnes, MinusOnes);
7188   return MinusOnes;
7189 }
7190
7191 template <typename TraitsType>
7192 Variable *TargetX86Base<TraitsType>::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
7193   Variable *Dest = makeVectorOfZeros(Ty, RegNum);
7194   Variable *MinusOne = makeVectorOfMinusOnes(Ty);
7195   _psub(Dest, MinusOne);
7196   return Dest;
7197 }
7198
7199 template <typename TraitsType>
7200 Variable *TargetX86Base<TraitsType>::makeVectorOfHighOrderBits(Type Ty,
7201                                                                RegNumT RegNum) {
7202   assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
7203          Ty == IceType_v16i8);
7204   if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
7205     Variable *Reg = makeVectorOfOnes(Ty, RegNum);
7206     SizeT Shift =
7207         typeWidthInBytes(typeElementType(Ty)) * Traits::X86_CHAR_BIT - 1;
7208     _psll(Reg, Ctx->getConstantInt8(Shift));
7209     return Reg;
7210   } else {
7211     // SSE has no left shift operation for vectors of 8 bit integers.
7212     constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
7213     Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
7214     Variable *Reg = makeReg(Ty, RegNum);
7215     _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
7216     _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
7217     return Reg;
7218   }
7219 }
7220
7221 /// Construct a mask in a register that can be and'ed with a floating-point
7222 /// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
7223 /// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of
7224 /// ones logically right shifted one bit.
7225 // TODO(stichnot): Fix the wala
7226 // TODO: above, to represent vector constants in memory.
7227 template <typename TraitsType>
7228 Variable *TargetX86Base<TraitsType>::makeVectorOfFabsMask(Type Ty,
7229                                                           RegNumT RegNum) {
7230   Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
7231   _psrl(Reg, Ctx->getConstantInt8(1));
7232   return Reg;
7233 }
7234
7235 template <typename TraitsType>
7236 typename TargetX86Base<TraitsType>::X86OperandMem *
7237 TargetX86Base<TraitsType>::getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
7238                                                         uint32_t Offset) {
7239   // Ensure that Loc is a stack slot.
7240   assert(Slot->mustNotHaveReg());
7241   assert(Slot->getRegNum().hasNoValue());
7242   // Compute the location of Loc in memory.
7243   // TODO(wala,stichnot): lea should not
7244   // be required. The address of the stack slot is known at compile time
7245   // (although not until after addProlog()).
7246   constexpr Type PointerType = IceType_i32;
7247   Variable *Loc = makeReg(PointerType);
7248   _lea(Loc, Slot);
7249   Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
7250   return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
7251 }
7252
7253 /// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
7254 /// Src is assumed to already be legalized.  If the source operand is known to
7255 /// be a memory or immediate operand, a simple mov will suffice.  But if the
7256 /// source operand can be a physical register, then it must first be copied into
7257 /// a physical register that is truncable to 8-bit, then truncated into a
7258 /// physical register that can receive a truncation, and finally copied into the
7259 /// result 8-bit register (which in general can be any 8-bit register).  For
7260 /// example, moving %ebp into %ah may be accomplished as:
7261 ///   movl %ebp, %edx
7262 ///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
7263 ///   movb %dl, %ah
7264 /// On the other hand, moving a memory or immediate operand into ah:
7265 ///   movb 4(%ebp), %ah
7266 ///   movb $my_imm, %ah
7267 ///
7268 /// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
7269 /// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
7270 /// use RegNum=RegNumT() and then let the caller do a separate copy into
7271 /// Reg_ah.
7272 ///
7273 /// Note #2.  ConstantRelocatable operands are also put through this process
7274 /// (not truncated directly) because our ELF emitter does R_386_32 relocations
7275 /// but not R_386_8 relocations.
7276 ///
7277 /// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
7278 /// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
7279 /// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
7280 /// to the pinsrb instruction.
7281 template <typename TraitsType>
7282 Variable *TargetX86Base<TraitsType>::copyToReg8(Operand *Src, RegNumT RegNum) {
7283   Type Ty = Src->getType();
7284   assert(isScalarIntegerType(Ty));
7285   assert(Ty != IceType_i1);
7286   Variable *Reg = makeReg(IceType_i8, RegNum);
7287   Reg->setRegClass(RCX86_IsTrunc8Rcvr);
7288   if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
7289     Variable *SrcTruncable = makeReg(Ty);
7290     switch (Ty) {
7291     case IceType_i64:
7292       SrcTruncable->setRegClass(RCX86_Is64To8);
7293       break;
7294     case IceType_i32:
7295       SrcTruncable->setRegClass(RCX86_Is32To8);
7296       break;
7297     case IceType_i16:
7298       SrcTruncable->setRegClass(RCX86_Is16To8);
7299       break;
7300     default:
7301       // i8 - just use default register class
7302       break;
7303     }
7304     Variable *SrcRcvr = makeReg(IceType_i8);
7305     SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
7306     _mov(SrcTruncable, Src);
7307     _mov(SrcRcvr, SrcTruncable);
7308     Src = SrcRcvr;
7309   }
7310   _mov(Reg, Src);
7311   return Reg;
7312 }
7313
7314 /// Helper for legalize() to emit the right code to lower an operand to a
7315 /// register of the appropriate type.
7316 template <typename TraitsType>
7317 Variable *TargetX86Base<TraitsType>::copyToReg(Operand *Src, RegNumT RegNum) {
7318   Type Ty = Src->getType();
7319   Variable *Reg = makeReg(Ty, RegNum);
7320   if (isVectorType(Ty)) {
7321     _movp(Reg, Src);
7322   } else {
7323     _mov(Reg, Src);
7324   }
7325   return Reg;
7326 }
7327
7328 template <typename TraitsType>
7329 Operand *TargetX86Base<TraitsType>::legalize(Operand *From, LegalMask Allowed,
7330                                              RegNumT RegNum) {
7331   const bool UseNonsfi = getFlags().getUseNonsfi();
7332   const Type Ty = From->getType();
7333   // Assert that a physical register is allowed. To date, all calls to
7334   // legalize() allow a physical register. If a physical register needs to be
7335   // explicitly disallowed, then new code will need to be written to force a
7336   // spill.
7337   assert(Allowed & Legal_Reg);
7338   // If we're asking for a specific physical register, make sure we're not
7339   // allowing any other operand kinds. (This could be future work, e.g. allow
7340   // the shl shift amount to be either an immediate or in ecx.)
7341   assert(RegNum.hasNoValue() || Allowed == Legal_Reg);
7342
7343   // Substitute with an available infinite-weight variable if possible.  Only do
7344   // this when we are not asking for a specific register, and when the
7345   // substitution is not locked to a specific register, and when the types
7346   // match, in order to capture the vast majority of opportunities and avoid
7347   // corner cases in the lowering.
7348   if (RegNum.hasNoValue()) {
7349     if (Variable *Subst = getContext().availabilityGet(From)) {
7350       // At this point we know there is a potential substitution available.
7351       if (Subst->mustHaveReg() && !Subst->hasReg()) {
7352         // At this point we know the substitution will have a register.
7353         if (From->getType() == Subst->getType()) {
7354           // At this point we know the substitution's register is compatible.
7355           return Subst;
7356         }
7357       }
7358     }
7359   }
7360
7361   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
7362     // Before doing anything with a Mem operand, we need to ensure that the
7363     // Base and Index components are in physical registers.
7364     Variable *Base = Mem->getBase();
7365     Variable *Index = Mem->getIndex();
7366     Constant *Offset = Mem->getOffset();
7367     Variable *RegBase = nullptr;
7368     Variable *RegIndex = nullptr;
7369     uint16_t Shift = Mem->getShift();
7370     if (Base) {
7371       RegBase = llvm::cast<Variable>(
7372           legalize(Base, Legal_Reg | Legal_Rematerializable));
7373     }
7374     if (Index) {
7375       // TODO(jpp): perhaps we should only allow Legal_Reg if
7376       // Base->isRematerializable.
7377       RegIndex = llvm::cast<Variable>(
7378           legalize(Index, Legal_Reg | Legal_Rematerializable));
7379     }
7380
7381     if (Base != RegBase || Index != RegIndex) {
7382       Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
7383                                   Mem->getSegmentRegister());
7384     }
7385
7386     // For all Memory Operands, we do randomization/pooling here.
7387     From = randomizeOrPoolImmediate(Mem);
7388
7389     if (!(Allowed & Legal_Mem)) {
7390       From = copyToReg(From, RegNum);
7391     }
7392     return From;
7393   }
7394
7395   if (auto *Const = llvm::dyn_cast<Constant>(From)) {
7396     if (llvm::isa<ConstantUndef>(Const)) {
7397       From = legalizeUndef(Const, RegNum);
7398       if (isVectorType(Ty))
7399         return From;
7400       Const = llvm::cast<Constant>(From);
7401     }
7402     // There should be no constants of vector type (other than undef).
7403     assert(!isVectorType(Ty));
7404
7405     // If the operand is a 64 bit constant integer we need to legalize it to a
7406     // register in x86-64.
7407     if (Traits::Is64Bit) {
7408       if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) {
7409         if (!Utils::IsInt(32, C64->getValue())) {
7410           if (RegNum.hasValue()) {
7411             assert(Traits::getGprForType(IceType_i64, RegNum) == RegNum);
7412           }
7413           return copyToReg(Const, RegNum);
7414         }
7415       }
7416     }
7417
7418     // If the operand is an 32 bit constant integer, we should check whether we
7419     // need to randomize it or pool it.
7420     if (auto *C = llvm::dyn_cast<ConstantInteger32>(Const)) {
7421       Operand *NewConst = randomizeOrPoolImmediate(C, RegNum);
7422       if (NewConst != Const) {
7423         return NewConst;
7424       }
7425     }
7426
7427     if (auto *CR = llvm::dyn_cast<ConstantRelocatable>(Const)) {
7428       // If the operand is a ConstantRelocatable, and Legal_AddrAbs is not
7429       // specified, and UseNonsfi is indicated, we need to add RebasePtr.
7430       if (UseNonsfi && !(Allowed & Legal_AddrAbs)) {
7431         assert(Ty == IceType_i32);
7432         Variable *NewVar = makeReg(Ty, RegNum);
7433         auto *Mem = Traits::X86OperandMem::create(Func, Ty, nullptr, CR);
7434         // LEAs are not automatically sandboxed, thus we explicitly invoke
7435         // _sandbox_mem_reference.
7436         _lea(NewVar, _sandbox_mem_reference(Mem));
7437         From = NewVar;
7438       }
7439     } else if (isScalarFloatingType(Ty)) {
7440       // Convert a scalar floating point constant into an explicit memory
7441       // operand.
7442       if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
7443         if (Utils::isPositiveZero(ConstFloat->getValue()))
7444           return makeZeroedRegister(Ty, RegNum);
7445       } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
7446         if (Utils::isPositiveZero(ConstDouble->getValue()))
7447           return makeZeroedRegister(Ty, RegNum);
7448       }
7449
7450       auto *CFrom = llvm::cast<Constant>(From);
7451       assert(CFrom->getShouldBePooled());
7452       Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
7453       auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
7454       From = Mem;
7455     }
7456
7457     bool NeedsReg = false;
7458     if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
7459       // Immediate specifically not allowed.
7460       NeedsReg = true;
7461     if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
7462       // On x86, FP constants are lowered to mem operands.
7463       NeedsReg = true;
7464     if (NeedsReg) {
7465       From = copyToReg(From, RegNum);
7466     }
7467     return From;
7468   }
7469
7470   if (auto *Var = llvm::dyn_cast<Variable>(From)) {
7471     // Check if the variable is guaranteed a physical register. This can happen
7472     // either when the variable is pre-colored or when it is assigned infinite
7473     // weight.
7474     bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
7475     bool MustRematerialize =
7476         (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
7477     // We need a new physical register for the operand if:
7478     // - Mem is not allowed and Var isn't guaranteed a physical register, or
7479     // - RegNum is required and Var->getRegNum() doesn't match, or
7480     // - Var is a rematerializable variable and rematerializable pass-through is
7481     //   not allowed (in which case we need an lea instruction).
7482     if (MustRematerialize) {
7483       assert(Ty == IceType_i32);
7484       Variable *NewVar = makeReg(Ty, RegNum);
7485       // Since Var is rematerializable, the offset will be added when the lea is
7486       // emitted.
7487       constexpr Constant *NoOffset = nullptr;
7488       auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
7489       _lea(NewVar, Mem);
7490       From = NewVar;
7491     } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
7492                (RegNum.hasValue() && RegNum != Var->getRegNum())) {
7493       From = copyToReg(From, RegNum);
7494     }
7495     return From;
7496   }
7497
7498   llvm::report_fatal_error("Unhandled operand kind in legalize()");
7499   return From;
7500 }
7501
7502 /// Provide a trivial wrapper to legalize() for this common usage.
7503 template <typename TraitsType>
7504 Variable *TargetX86Base<TraitsType>::legalizeToReg(Operand *From,
7505                                                    RegNumT RegNum) {
7506   return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
7507 }
7508
7509 /// Legalize undef values to concrete values.
7510 template <typename TraitsType>
7511 Operand *TargetX86Base<TraitsType>::legalizeUndef(Operand *From,
7512                                                   RegNumT RegNum) {
7513   Type Ty = From->getType();
7514   if (llvm::isa<ConstantUndef>(From)) {
7515     // Lower undefs to zero.  Another option is to lower undefs to an
7516     // uninitialized register; however, using an uninitialized register results
7517     // in less predictable code.
7518     //
7519     // If in the future the implementation is changed to lower undef values to
7520     // uninitialized registers, a FakeDef will be needed:
7521     //     Context.insert<InstFakeDef>(Reg);
7522     // This is in order to ensure that the live range of Reg is not
7523     // overestimated.  If the constant being lowered is a 64 bit value, then
7524     // the result should be split and the lo and hi components will need to go
7525     // in uninitialized registers.
7526     if (isVectorType(Ty))
7527       return makeVectorOfZeros(Ty, RegNum);
7528     return Ctx->getConstantZero(Ty);
7529   }
7530   return From;
7531 }
7532
7533 /// For the cmp instruction, if Src1 is an immediate, or known to be a physical
7534 /// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
7535 /// copied into a physical register. (Actually, either Src0 or Src1 can be
7536 /// chosen for the physical register, but unfortunately we have to commit to one
7537 /// or the other before register allocation.)
7538 template <typename TraitsType>
7539 Operand *TargetX86Base<TraitsType>::legalizeSrc0ForCmp(Operand *Src0,
7540                                                        Operand *Src1) {
7541   bool IsSrc1ImmOrReg = false;
7542   if (llvm::isa<Constant>(Src1)) {
7543     IsSrc1ImmOrReg = true;
7544   } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
7545     if (Var->hasReg())
7546       IsSrc1ImmOrReg = true;
7547   }
7548   return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
7549 }
7550
7551 template <typename TraitsType>
7552 typename TargetX86Base<TraitsType>::X86OperandMem *
7553 TargetX86Base<TraitsType>::formMemoryOperand(Operand *Opnd, Type Ty,
7554                                              bool DoLegalize) {
7555   auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
7556   // It may be the case that address mode optimization already creates an
7557   // X86OperandMem, so in that case it wouldn't need another level of
7558   // transformation.
7559   if (!Mem) {
7560     auto *Base = llvm::dyn_cast<Variable>(Opnd);
7561     auto *Offset = llvm::dyn_cast<Constant>(Opnd);
7562     assert(Base || Offset);
7563     if (Offset) {
7564       // During memory operand building, we do not blind or pool the constant
7565       // offset, we will work on the whole memory operand later as one entity
7566       // later, this save one instruction. By turning blinding and pooling off,
7567       // we guarantee legalize(Offset) will return a Constant*.
7568       if (!llvm::isa<ConstantRelocatable>(Offset)) {
7569         BoolFlagSaver B(RandomizationPoolingPaused, true);
7570
7571         Offset = llvm::cast<Constant>(legalize(Offset));
7572       }
7573
7574       assert(llvm::isa<ConstantInteger32>(Offset) ||
7575              llvm::isa<ConstantRelocatable>(Offset));
7576     }
7577     // Not completely sure whether it's OK to leave IsRebased unset when
7578     // creating the mem operand.  If DoLegalize is true, it will definitely be
7579     // applied during the legalize() call, but perhaps not during the
7580     // randomizeOrPoolImmediate() call.  In any case, the emit routines will
7581     // assert that PIC legalization has been applied.
7582     Mem = X86OperandMem::create(Func, Ty, Base, Offset);
7583   }
7584   // Do legalization, which contains randomization/pooling or do
7585   // randomization/pooling.
7586   return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem)
7587                                               : randomizeOrPoolImmediate(Mem));
7588 }
7589
7590 template <typename TraitsType>
7591 Variable *TargetX86Base<TraitsType>::makeReg(Type Type, RegNumT RegNum) {
7592   // There aren't any 64-bit integer registers for x86-32.
7593   assert(Traits::Is64Bit || Type != IceType_i64);
7594   Variable *Reg = Func->makeVariable(Type);
7595   if (RegNum.hasValue())
7596     Reg->setRegNum(RegNum);
7597   else
7598     Reg->setMustHaveReg();
7599   return Reg;
7600 }
7601
7602 template <typename TraitsType>
7603 const Type TargetX86Base<TraitsType>::TypeForSize[] = {
7604     IceType_i8, IceType_i16, IceType_i32, IceType_f64, IceType_v16i8};
7605 template <typename TraitsType>
7606 Type TargetX86Base<TraitsType>::largestTypeInSize(uint32_t Size,
7607                                                   uint32_t MaxSize) {
7608   assert(Size != 0);
7609   uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
7610   uint32_t MaxIndex = MaxSize == NoSizeLimit
7611                           ? llvm::array_lengthof(TypeForSize) - 1
7612                           : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
7613   return TypeForSize[std::min(TyIndex, MaxIndex)];
7614 }
7615
7616 template <typename TraitsType>
7617 Type TargetX86Base<TraitsType>::firstTypeThatFitsSize(uint32_t Size,
7618                                                       uint32_t MaxSize) {
7619   assert(Size != 0);
7620   uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
7621   if (!llvm::isPowerOf2_32(Size))
7622     ++TyIndex;
7623   uint32_t MaxIndex = MaxSize == NoSizeLimit
7624                           ? llvm::array_lengthof(TypeForSize) - 1
7625                           : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
7626   return TypeForSize[std::min(TyIndex, MaxIndex)];
7627 }
7628
7629 template <typename TraitsType> void TargetX86Base<TraitsType>::postLower() {
7630   if (Func->getOptLevel() == Opt_m1)
7631     return;
7632   markRedefinitions();
7633   Context.availabilityUpdate();
7634 }
7635
7636 template <typename TraitsType>
7637 void TargetX86Base<TraitsType>::makeRandomRegisterPermutation(
7638     llvm::SmallVectorImpl<RegNumT> &Permutation,
7639     const SmallBitVector &ExcludeRegisters, uint64_t Salt) const {
7640   Traits::makeRandomRegisterPermutation(Func, Permutation, ExcludeRegisters,
7641                                         Salt);
7642 }
7643
7644 template <typename TraitsType>
7645 void TargetX86Base<TraitsType>::emit(const ConstantInteger32 *C) const {
7646   if (!BuildDefs::dump())
7647     return;
7648   Ostream &Str = Ctx->getStrEmit();
7649   Str << "$" << C->getValue();
7650 }
7651
7652 template <typename TraitsType>
7653 void TargetX86Base<TraitsType>::emit(const ConstantInteger64 *C) const {
7654   if (!Traits::Is64Bit) {
7655     llvm::report_fatal_error("Not expecting to emit 64-bit integers");
7656   } else {
7657     if (!BuildDefs::dump())
7658       return;
7659     Ostream &Str = Ctx->getStrEmit();
7660     Str << "$" << C->getValue();
7661   }
7662 }
7663
7664 template <typename TraitsType>
7665 void TargetX86Base<TraitsType>::emit(const ConstantFloat *C) const {
7666   if (!BuildDefs::dump())
7667     return;
7668   Ostream &Str = Ctx->getStrEmit();
7669   Str << C->getLabelName();
7670 }
7671
7672 template <typename TraitsType>
7673 void TargetX86Base<TraitsType>::emit(const ConstantDouble *C) const {
7674   if (!BuildDefs::dump())
7675     return;
7676   Ostream &Str = Ctx->getStrEmit();
7677   Str << C->getLabelName();
7678 }
7679
7680 template <typename TraitsType>
7681 void TargetX86Base<TraitsType>::emit(const ConstantUndef *) const {
7682   llvm::report_fatal_error("undef value encountered by emitter.");
7683 }
7684
7685 template <class Machine>
7686 void TargetX86Base<Machine>::emit(const ConstantRelocatable *C) const {
7687   if (!BuildDefs::dump())
7688     return;
7689   assert(!getFlags().getUseNonsfi() ||
7690          C->getName().toString() == GlobalOffsetTable);
7691   Ostream &Str = Ctx->getStrEmit();
7692   Str << "$";
7693   emitWithoutPrefix(C);
7694 }
7695
7696 /// Randomize or pool an Immediate.
7697 template <typename TraitsType>
7698 Operand *
7699 TargetX86Base<TraitsType>::randomizeOrPoolImmediate(Constant *Immediate,
7700                                                     RegNumT RegNum) {
7701   assert(llvm::isa<ConstantInteger32>(Immediate) ||
7702          llvm::isa<ConstantRelocatable>(Immediate));
7703   if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
7704       RandomizationPoolingPaused == true) {
7705     // Immediates randomization/pooling off or paused
7706     return Immediate;
7707   }
7708
7709   if (Traits::Is64Bit && NeedSandboxing) {
7710     // Immediate randomization/pooling is currently disabled for x86-64
7711     // sandboxing for it could generate invalid memory operands.
7712     assert(false &&
7713            "Constant pooling/randomization is disabled for x8664 sandbox.");
7714     return Immediate;
7715   }
7716
7717   if (!Immediate->shouldBeRandomizedOrPooled()) {
7718     // the constant Immediate is not eligible for blinding/pooling
7719     return Immediate;
7720   }
7721   Ctx->statsUpdateRPImms();
7722   switch (getFlags().getRandomizeAndPoolImmediatesOption()) {
7723   default:
7724     llvm::report_fatal_error("Unsupported -randomize-pool-immediates option");
7725   case RPI_Randomize: {
7726     // blind the constant
7727     // FROM:
7728     //  imm
7729     // TO:
7730     //  insert: mov imm+cookie, Reg
7731     //  insert: lea -cookie[Reg], Reg
7732     //  => Reg
7733     // If we have already assigned a phy register, we must come from
7734     // advancedPhiLowering()=>lowerAssign(). In this case we should reuse the
7735     // assigned register as this assignment is that start of its use-def
7736     // chain. So we add RegNum argument here. Note we use 'lea' instruction
7737     // instead of 'xor' to avoid affecting the flags.
7738     Variable *Reg = makeReg(IceType_i32, RegNum);
7739     auto *Integer = llvm::cast<ConstantInteger32>(Immediate);
7740     uint32_t Value = Integer->getValue();
7741     uint32_t Cookie = Func->getConstantBlindingCookie();
7742     _mov(Reg, Ctx->getConstantInt(IceType_i32, Cookie + Value));
7743     Constant *Offset = Ctx->getConstantInt(IceType_i32, 0 - Cookie);
7744     _lea(Reg, X86OperandMem::create(Func, IceType_i32, Reg, Offset));
7745     if (Immediate->getType() == IceType_i32) {
7746       return Reg;
7747     }
7748     Variable *TruncReg = makeReg(Immediate->getType(), RegNum);
7749     _mov(TruncReg, Reg);
7750     return TruncReg;
7751   }
7752   case RPI_Pool: {
7753     // pool the constant
7754     // FROM:
7755     //  imm
7756     // TO:
7757     //  insert: mov $label, Reg
7758     //  => Reg
7759     assert(getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Pool);
7760     assert(Immediate->getShouldBePooled());
7761     // if we have already assigned a phy register, we must come from
7762     // advancedPhiLowering()=>lowerAssign(). In this case we should reuse the
7763     // assigned register as this assignment is that start of its use-def
7764     // chain. So we add RegNum argument here.
7765     Variable *Reg = makeReg(Immediate->getType(), RegNum);
7766     constexpr RelocOffsetT Offset = 0;
7767     Constant *Symbol = Ctx->getConstantSym(Offset, Immediate->getLabelName());
7768     constexpr Variable *NoBase = nullptr;
7769     X86OperandMem *MemOperand =
7770         X86OperandMem::create(Func, Immediate->getType(), NoBase, Symbol);
7771     _mov(Reg, MemOperand);
7772     return Reg;
7773   }
7774   }
7775 }
7776
7777 template <typename TraitsType>
7778 typename TargetX86Base<TraitsType>::X86OperandMem *
7779 TargetX86Base<TraitsType>::randomizeOrPoolImmediate(X86OperandMem *MemOperand,
7780                                                     RegNumT RegNum) {
7781   assert(MemOperand);
7782   if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_None ||
7783       RandomizationPoolingPaused == true) {
7784     // immediates randomization/pooling is turned off
7785     return MemOperand;
7786   }
7787
7788   if (Traits::Is64Bit && NeedSandboxing) {
7789     // Immediate randomization/pooling is currently disabled for x86-64
7790     // sandboxing for it could generate invalid memory operands.
7791     assert(false &&
7792            "Constant pooling/randomization is disabled for x8664 sandbox.");
7793     return MemOperand;
7794   }
7795
7796   // If this memory operand is already a randomized one, we do not randomize it
7797   // again.
7798   if (MemOperand->getRandomized())
7799     return MemOperand;
7800
7801   auto *C = llvm::dyn_cast_or_null<Constant>(MemOperand->getOffset());
7802
7803   if (C == nullptr) {
7804     return MemOperand;
7805   }
7806
7807   if (!C->shouldBeRandomizedOrPooled()) {
7808     return MemOperand;
7809   }
7810
7811   // The offset of this mem operand should be blinded or pooled
7812   Ctx->statsUpdateRPImms();
7813   switch (getFlags().getRandomizeAndPoolImmediatesOption()) {
7814   default:
7815     llvm::report_fatal_error("Unsupported -randomize-pool-immediates option");
7816   case RPI_Randomize: {
7817     // blind the constant offset
7818     // FROM:
7819     //  offset[base, index, shift]
7820     // TO:
7821     //  insert: lea offset+cookie[base], RegTemp
7822     //  => -cookie[RegTemp, index, shift]
7823     uint32_t Value =
7824         llvm::dyn_cast<ConstantInteger32>(MemOperand->getOffset())->getValue();
7825     uint32_t Cookie = Func->getConstantBlindingCookie();
7826     Constant *Mask1 =
7827         Ctx->getConstantInt(MemOperand->getOffset()->getType(), Cookie + Value);
7828     Constant *Mask2 =
7829         Ctx->getConstantInt(MemOperand->getOffset()->getType(), 0 - Cookie);
7830
7831     X86OperandMem *TempMemOperand = X86OperandMem::create(
7832         Func, MemOperand->getType(), MemOperand->getBase(), Mask1);
7833     // If we have already assigned a physical register, we must come from
7834     // advancedPhiLowering()=>lowerAssign(). In this case we should reuse
7835     // the assigned register as this assignment is that start of its
7836     // use-def chain. So we add RegNum argument here.
7837     Variable *RegTemp = makeReg(MemOperand->getOffset()->getType(), RegNum);
7838     _lea(RegTemp, TempMemOperand);
7839
7840     X86OperandMem *NewMemOperand = X86OperandMem::create(
7841         Func, MemOperand->getType(), RegTemp, Mask2, MemOperand->getIndex(),
7842         MemOperand->getShift(), MemOperand->getSegmentRegister(),
7843         MemOperand->getIsRebased());
7844
7845     // Label this memory operand as randomized, so we won't randomize it
7846     // again in case we call legalize() multiple times on this memory
7847     // operand.
7848     NewMemOperand->setRandomized(true);
7849     return NewMemOperand;
7850   }
7851   case RPI_Pool: {
7852     // pool the constant offset
7853     // FROM:
7854     //  offset[base, index, shift]
7855     // TO:
7856     //  insert: mov $label, RegTemp
7857     //  insert: lea [base, RegTemp], RegTemp
7858     //  =>[RegTemp, index, shift]
7859
7860     // Memory operand should never exist as source operands in phi lowering
7861     // assignments, so there is no need to reuse any registers here. For
7862     // phi lowering, we should not ask for new physical registers in
7863     // general. However, if we do meet Memory Operand during phi lowering,
7864     // we should not blind or pool the immediates for now.
7865     if (RegNum.hasValue())
7866       return MemOperand;
7867     Variable *RegTemp = makeReg(IceType_i32);
7868     assert(MemOperand->getOffset()->getShouldBePooled());
7869     constexpr RelocOffsetT SymOffset = 0;
7870     Constant *Symbol =
7871         Ctx->getConstantSym(SymOffset, MemOperand->getOffset()->getLabelName());
7872     constexpr Variable *NoBase = nullptr;
7873     X86OperandMem *SymbolOperand = X86OperandMem::create(
7874         Func, MemOperand->getOffset()->getType(), NoBase, Symbol);
7875     _mov(RegTemp, SymbolOperand);
7876     // If we have a base variable here, we should add the lea instruction
7877     // to add the value of the base variable to RegTemp. If there is no
7878     // base variable, we won't need this lea instruction.
7879     if (MemOperand->getBase()) {
7880       X86OperandMem *CalculateOperand = X86OperandMem::create(
7881           Func, MemOperand->getType(), MemOperand->getBase(), nullptr, RegTemp,
7882           0, MemOperand->getSegmentRegister());
7883       _lea(RegTemp, CalculateOperand);
7884     }
7885     X86OperandMem *NewMemOperand = X86OperandMem::create(
7886         Func, MemOperand->getType(), RegTemp, nullptr, MemOperand->getIndex(),
7887         MemOperand->getShift(), MemOperand->getSegmentRegister());
7888     return NewMemOperand;
7889   }
7890   }
7891 }
7892
7893 template <typename TraitsType>
7894 void TargetX86Base<TraitsType>::emitJumpTable(
7895     const Cfg *, const InstJumpTable *JumpTable) const {
7896   if (!BuildDefs::dump())
7897     return;
7898   Ostream &Str = Ctx->getStrEmit();
7899   const bool UseNonsfi = getFlags().getUseNonsfi();
7900   const char *Prefix = UseNonsfi ? ".data.rel.ro." : ".rodata.";
7901   Str << "\t.section\t" << Prefix << JumpTable->getSectionName()
7902       << ",\"a\",@progbits\n"
7903          "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n"
7904       << JumpTable->getName() << ":";
7905
7906   // On X86 ILP32 pointers are 32-bit hence the use of .long
7907   for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
7908     Str << "\n\t.long\t" << JumpTable->getTarget(I)->getAsmName();
7909   Str << "\n";
7910 }
7911
7912 template <typename TraitsType>
7913 template <typename T>
7914 void TargetDataX86<TraitsType>::emitConstantPool(GlobalContext *Ctx) {
7915   if (!BuildDefs::dump())
7916     return;
7917   Ostream &Str = Ctx->getStrEmit();
7918   Type Ty = T::Ty;
7919   SizeT Align = typeAlignInBytes(Ty);
7920   ConstantList Pool = Ctx->getConstantPool(Ty);
7921
7922   Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
7923       << "\n";
7924   Str << "\t.align\t" << Align << "\n";
7925
7926   // If reorder-pooled-constants option is set to true, we need to shuffle the
7927   // constant pool before emitting it.
7928   if (getFlags().getReorderPooledConstants() && !Pool.empty()) {
7929     // Use the constant's kind value as the salt for creating random number
7930     // generator.
7931     Operand::OperandKind K = (*Pool.begin())->getKind();
7932     RandomNumberGenerator RNG(getFlags().getRandomSeed(),
7933                               RPE_PooledConstantReordering, K);
7934     RandomShuffle(Pool.begin(), Pool.end(),
7935                   [&RNG](uint64_t N) { return (uint32_t)RNG.next(N); });
7936   }
7937
7938   for (Constant *C : Pool) {
7939     if (!C->getShouldBePooled())
7940       continue;
7941     auto *Const = llvm::cast<typename T::IceType>(C);
7942     typename T::IceType::PrimType Value = Const->getValue();
7943     // Use memcpy() to copy bits from Value into RawValue in a way that avoids
7944     // breaking strict-aliasing rules.
7945     typename T::PrimitiveIntType RawValue;
7946     memcpy(&RawValue, &Value, sizeof(Value));
7947     char buf[30];
7948     int CharsPrinted =
7949         snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
7950     assert(CharsPrinted >= 0);
7951     assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
7952     (void)CharsPrinted; // avoid warnings if asserts are disabled
7953     Str << Const->getLabelName();
7954     Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
7955         << Value << " */\n";
7956   }
7957 }
7958
7959 template <typename TraitsType>
7960 void TargetDataX86<TraitsType>::lowerConstants() {
7961   if (getFlags().getDisableTranslation())
7962     return;
7963   switch (getFlags().getOutFileType()) {
7964   case FT_Elf: {
7965     ELFObjectWriter *Writer = Ctx->getObjectWriter();
7966
7967     Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
7968     Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
7969     Writer->writeConstantPool<ConstantInteger32>(IceType_i32);
7970
7971     Writer->writeConstantPool<ConstantFloat>(IceType_f32);
7972     Writer->writeConstantPool<ConstantDouble>(IceType_f64);
7973   } break;
7974   case FT_Asm:
7975   case FT_Iasm: {
7976     OstreamLocker L(Ctx);
7977
7978     emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
7979     emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
7980     emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);
7981
7982     emitConstantPool<PoolTypeConverter<float>>(Ctx);
7983     emitConstantPool<PoolTypeConverter<double>>(Ctx);
7984   } break;
7985   }
7986 }
7987
7988 template <typename TraitsType>
7989 void TargetDataX86<TraitsType>::lowerJumpTables() {
7990   const bool IsPIC = getFlags().getUseNonsfi();
7991   switch (getFlags().getOutFileType()) {
7992   case FT_Elf: {
7993     ELFObjectWriter *Writer = Ctx->getObjectWriter();
7994     for (const JumpTableData &JT : Ctx->getJumpTables())
7995       Writer->writeJumpTable(JT, Traits::FK_Abs, IsPIC);
7996   } break;
7997   case FT_Asm:
7998     // Already emitted from Cfg
7999     break;
8000   case FT_Iasm: {
8001     if (!BuildDefs::dump())
8002       return;
8003     Ostream &Str = Ctx->getStrEmit();
8004     const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
8005     for (const JumpTableData &JT : Ctx->getJumpTables()) {
8006       Str << "\t.section\t" << Prefix << JT.getSectionName()
8007           << ",\"a\",@progbits\n"
8008              "\t.align\t" << typeWidthInBytes(getPointerType()) << "\n"
8009           << JT.getName().toString() << ":";
8010
8011       // On X8664 ILP32 pointers are 32-bit hence the use of .long
8012       for (intptr_t TargetOffset : JT.getTargetOffsets())
8013         Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
8014       Str << "\n";
8015     }
8016   } break;
8017   }
8018 }
8019
8020 template <typename TraitsType>
8021 void TargetDataX86<TraitsType>::lowerGlobals(
8022     const VariableDeclarationList &Vars, const std::string &SectionSuffix) {
8023   const bool IsPIC = getFlags().getUseNonsfi();
8024   switch (getFlags().getOutFileType()) {
8025   case FT_Elf: {
8026     ELFObjectWriter *Writer = Ctx->getObjectWriter();
8027     Writer->writeDataSection(Vars, Traits::FK_Abs, SectionSuffix, IsPIC);
8028   } break;
8029   case FT_Asm:
8030   case FT_Iasm: {
8031     OstreamLocker L(Ctx);
8032     for (const VariableDeclaration *Var : Vars) {
8033       if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
8034         emitGlobal(*Var, SectionSuffix);
8035       }
8036     }
8037   } break;
8038   }
8039 }
} // end of namespace X86
8041 } // end of namespace Ice
8042
8043 #endif // SUBZERO_SRC_ICETARGETLOWERINGX86BASEIMPL_H