src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp

   1 //===-- AMDILPeepholeOptimizer.cpp - TODO: Add brief description -------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //==-----------------------------------------------------------------------===//
   9
  10 #define DEBUG_TYPE "PeepholeOpt"
  11 #ifdef DEBUG
  12 #define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
  13 #else
  14 #define DEBUGME 0
  15 #endif
  16
  17 #include "AMDILAlgorithms.tpp"
  18 #include "AMDILDevices.h"
  19 #include "AMDILGlobalManager.h"
  20 #include "AMDILKernelManager.h"
  21 #include "AMDILMachineFunctionInfo.h"
  22 #include "AMDILUtilityFunctions.h"
  23 #include "llvm/ADT/Statistic.h"
  24 #include "llvm/ADT/StringExtras.h"
  25 #include "llvm/ADT/StringRef.h"
  26 #include "llvm/ADT/Twine.h"
  27 #include "llvm/CodeGen/MachineFunction.h"
  28 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
  29 #include "llvm/Function.h"
  30 #include "llvm/Instructions.h"
  31 #include "llvm/Module.h"
  32 #include "llvm/Support/Debug.h"
  33 #include "llvm/Support/MathExtras.h"
  34
  35 #include <sstream>
  36
  37 #if 0
  38 STATISTIC(PointerAssignments, "Number of dynamic pointer "
  39     "assigments discovered");
  40 STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
  41 #endif
  42 STATISTIC(LocalFuncs, "Number of get_local_size(N) functions removed");
  43
  44 using namespace llvm;
  45 // The Peephole optimization pass is used to do simple last minute optimizations
  46 // that are required for correct code or to remove redundant functions
  47 namespace {
  48 class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public FunctionPass {
  49 public:
  50   TargetMachine &TM;
  51   static char ID;
  52   AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
  53   ~AMDILPeepholeOpt();
  54   const char *getPassName() const;
  55   bool runOnFunction(Function &F);
  56   bool doInitialization(Module &M);
  57   bool doFinalization(Module &M);
  58   void getAnalysisUsage(AnalysisUsage &AU) const;
  59 protected:
  60 private:
  61   // Function to initiate all of the instruction level optimizations.
  62   bool instLevelOptimizations(BasicBlock::iterator *inst);
  63   // Quick check to see if we need to dump all of the pointers into the
  64   // arena. If this is correct, then we set all pointers to exist in arena. This
  65   // is a workaround for aliasing of pointers in a struct/union.
  66   bool dumpAllIntoArena(Function &F);
  67   // Because I don't want to invalidate any pointers while in the
  68   // safeNestedForEachFunction. I push atomic conversions to a vector and handle
  69   // it later. This function does the conversions if required.
  70   void doAtomicConversionIfNeeded(Function &F);
  71   // Because __amdil_is_constant cannot be properly evaluated if
  72   // optimizations are disabled, the call's are placed in a vector
  73   // and evaluated after the __amdil_image* functions are evaluated
  74   // which should allow the __amdil_is_constant function to be
  75   // evaluated correctly.
  76   void doIsConstCallConversionIfNeeded();
  77   bool mChanged;
  78   bool mDebug;
  79   bool mRWGOpt;
  80   bool mConvertAtomics;
  81   CodeGenOpt::Level optLevel;
  82   // Run a series of tests to see if we can optimize a CALL instruction.
  83   bool optimizeCallInst(BasicBlock::iterator *bbb);
  84   // A peephole optimization to optimize bit extract sequences.
  85   bool optimizeBitExtract(Instruction *inst);
  86   // A peephole optimization to optimize bit insert sequences.
  87   bool optimizeBitInsert(Instruction *inst);
  88   bool setupBitInsert(Instruction *base,
  89                       Instruction *&src,
  90                       Constant *&mask,
  91                       Constant *&shift);
  92   // Expand the bit field insert instruction on versions of OpenCL that
  93   // don't support it.
  94   bool expandBFI(CallInst *CI);
  95   // Expand the bit field mask instruction on version of OpenCL that
  96   // don't support it.
  97   bool expandBFM(CallInst *CI);
  98   // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
  99   // this case we need to expand them. These functions check for 24bit functions
 100   // and then expand.
 101   bool isSigned24BitOps(CallInst *CI);
 102   void expandSigned24BitOps(CallInst *CI);
 103   // One optimization that can occur is that if the required workgroup size is
 104   // specified then the result of get_local_size is known at compile time and
 105   // can be returned accordingly.
 106   bool isRWGLocalOpt(CallInst *CI);
 107   void expandRWGLocalOpt(CallInst *CI);
 108   // On northern island cards, the division is slightly less accurate than on
 109   // previous generations, so we need to utilize a more accurate division. So we
 110   // can translate the accurate divide to a normal divide on all other cards.
 111   bool convertAccurateDivide(CallInst *CI);
 112   void expandAccurateDivide(CallInst *CI);
 113   // If the alignment is set incorrectly, it can produce really inefficient
 114   // code. This checks for this scenario and fixes it if possible.
 115   bool correctMisalignedMemOp(Instruction *inst);
 116
 117   // If we are in no opt mode, then we need to make sure that
 118   // local samplers are properly propagated as constant propagation
 119   // doesn't occur and we need to know the value of kernel defined
 120   // samplers at compile time.
 121   bool propagateSamplerInst(CallInst *CI);
 122
 123   LLVMContext *mCTX;
 124   Function *mF;
 125   const AMDILSubtarget *mSTM;
 126   SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
 127   SmallVector<CallInst *, 16> isConstVec;
 128 }; // class AMDILPeepholeOpt
 129   char AMDILPeepholeOpt::ID = 0;
 130 } // anonymous namespace
 131
 132 namespace llvm {
 133   FunctionPass *
 134   createAMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
 135   {
 136     return new AMDILPeepholeOpt(tm AMDIL_OPT_LEVEL_VAR);
 137   }
 138 } // llvm namespace
 139
 140 AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
 141   : FunctionPass(ID), TM(tm)
 142 {
 143   mDebug = DEBUGME;
 144   optLevel = TM.getOptLevel();
 145
 146 }
 147
 148 AMDILPeepholeOpt::~AMDILPeepholeOpt()
 149 {
 150 }
 151
 152 const char *
 153 AMDILPeepholeOpt::getPassName() const
 154 {
 155   return "AMDIL PeepHole Optimization Pass";
 156 }
 157
 158 bool
 159 containsPointerType(Type *Ty)
 160 {
 161   if (!Ty) {
 162     return false;
 163   }
 164   switch(Ty->getTypeID()) {
 165   default:
 166     return false;
 167   case Type::StructTyID: {
 168     const StructType *ST = dyn_cast<StructType>(Ty);
 169     for (StructType::element_iterator stb = ST->element_begin(),
 170            ste = ST->element_end(); stb != ste; ++stb) {
 171       if (!containsPointerType(*stb)) {
 172         continue;
 173       }
 174       return true;
 175     }
 176     break;
 177   }
 178   case Type::VectorTyID:
 179   case Type::ArrayTyID:
 180     return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
 181   case Type::PointerTyID:
 182     return true;
 183   };
 184   return false;
 185 }
 186
 187 bool
 188 AMDILPeepholeOpt::dumpAllIntoArena(Function &F)
 189 {
 190   bool dumpAll = false;
 191   for (Function::const_arg_iterator cab = F.arg_begin(),
 192        cae = F.arg_end(); cab != cae; ++cab) {
 193     const Argument *arg = cab;
 194     const PointerType *PT = dyn_cast<PointerType>(arg->getType());
 195     if (!PT) {
 196       continue;
 197     }
 198     Type *DereferencedType = PT->getElementType();
 199     if (!dyn_cast<StructType>(DereferencedType)
 200         ) {
 201       continue;
 202     }
 203     if (!containsPointerType(DereferencedType)) {
 204       continue;
 205     }
 206     // FIXME: Because a pointer inside of a struct/union may be aliased to
 207     // another pointer we need to take the conservative approach and place all
 208     // pointers into the arena until more advanced detection is implemented.
 209     dumpAll = true;
 210   }
 211   return dumpAll;
 212 }
 213 void
 214 AMDILPeepholeOpt::doIsConstCallConversionIfNeeded()
 215 {
 216   if (isConstVec.empty()) {
 217     return;
 218   }
 219   for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
 220     CallInst *CI = isConstVec[x];
 221     Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
 222     Type *aType = Type::getInt32Ty(*mCTX);
 223     Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
 224       : ConstantInt::get(aType, 0);
 225     CI->replaceAllUsesWith(Val);
 226     CI->eraseFromParent();
 227   }
 228   isConstVec.clear();
 229 }
 230 void
 231 AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
 232 {
 233   // Don't do anything if we don't have any atomic operations.
 234   if (atomicFuncs.empty()) {
 235     return;
 236   }
 237   // Change the function name for the atomic if it is required
 238   uint32_t size = atomicFuncs.size();
 239   for (uint32_t x = 0; x < size; ++x) {
 240     atomicFuncs[x].first->setOperand(
 241         atomicFuncs[x].first->getNumOperands()-1,
 242         atomicFuncs[x].second);
 243
 244   }
 245   mChanged = true;
 246   if (mConvertAtomics) {
 247     return;
 248   }
 249   // If we did not convert all of the atomics, then we need to make sure that
 250   // the atomics that were not converted have their base pointers set to use the
 251   // arena path.
 252   Function::arg_iterator argB = F.arg_begin();
 253   Function::arg_iterator argE = F.arg_end();
 254   AMDILKernelManager *KM = mSTM->getKernelManager();
 255   AMDILMachineFunctionInfo *mMFI = getAnalysis<MachineFunctionAnalysis>().getMF()
 256     .getInfo<AMDILMachineFunctionInfo>();
 257   for (; argB != argE; ++argB) {
 258     if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaUAV)) {
 259       KM->setUAVID(argB,mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID));
 260       mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID));
 261     } else {
 262       KM->setUAVID(argB,mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
 263       mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
 264     }
 265   }
 266 }
 267
 268 bool
 269 AMDILPeepholeOpt::runOnFunction(Function &MF)
 270 {
 271   mChanged = false;
 272   mF = &MF;
 273   mSTM = &TM.getSubtarget<AMDILSubtarget>();
 274   if (mDebug) {
 275     MF.dump();
 276   }
 277   mCTX = &MF.getType()->getContext();
 278   mConvertAtomics = true;
 279   if (dumpAllIntoArena(MF)) {
 280     for (Function::const_arg_iterator cab = MF.arg_begin(),
 281          cae = MF.arg_end(); cab != cae; ++cab) {
 282       const Argument *arg = cab;
 283       AMDILKernelManager *KM = mSTM->getKernelManager();
 284       KM->setUAVID(getBasePointerValue(arg),
 285           mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
 286     }
 287   }
 288   mRWGOpt = mSTM->getGlobalManager()->hasRWG(MF.getName());
 289   safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
 290      std::bind1st(std::mem_fun(&AMDILPeepholeOpt::instLevelOptimizations),
 291                   this));
 292
 293   doAtomicConversionIfNeeded(MF);
 294   doIsConstCallConversionIfNeeded();
 295
 296   if (mDebug) {
 297     MF.dump();
 298   }
 299   return mChanged;
 300 }
 301
 302 bool
 303 AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
 304 {
 305   Instruction *inst = (*bbb);
 306   CallInst *CI = dyn_cast<CallInst>(inst);
 307   if (!CI) {
 308     return false;
 309   }
 310   if (isSigned24BitOps(CI)) {
 311     expandSigned24BitOps(CI);
 312     ++(*bbb);
 313     CI->eraseFromParent();
 314     return true;
 315   }
 316   if (isRWGLocalOpt(CI)) {
 317     expandRWGLocalOpt(CI);
 318     return false;
 319   }
 320   if (propagateSamplerInst(CI)) {
 321     return false;
 322   }
 323   if (expandBFI(CI) || expandBFM(CI)) {
 324     ++(*bbb);
 325     CI->eraseFromParent();
 326     return true;
 327   }
 328   if (convertAccurateDivide(CI)) {
 329     expandAccurateDivide(CI);
 330     ++(*bbb);
 331     CI->eraseFromParent();
 332     return true;
 333   }
 334
 335   StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
 336   if (calleeName.startswith("__amdil_is_constant")) {
 337     // If we do not have optimizations, then this
 338     // cannot be properly evaluated, so we add the
 339     // call instruction to a vector and process
 340     // them at the end of processing after the
 341     // samplers have been correctly handled.
 342     if (optLevel == CodeGenOpt::None) {
 343       isConstVec.push_back(CI);
 344       return false;
 345     } else {
 346       Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
 347       Type *aType = Type::getInt32Ty(*mCTX);
 348       Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
 349         : ConstantInt::get(aType, 0);
 350       CI->replaceAllUsesWith(Val);
 351       ++(*bbb);
 352       CI->eraseFromParent();
 353       return true;
 354     }
 355   }
 356
 357   if (calleeName.equals("__amdil_is_asic_id_i32")) {
 358     ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
 359     Type *aType = Type::getInt32Ty(*mCTX);
 360     Value *Val = CV;
 361     if (Val) {
 362       Val = ConstantInt::get(aType,
 363           mSTM->device()->getDeviceFlag() & CV->getZExtValue());
 364     } else {
 365       Val = ConstantInt::get(aType, 0);
 366     }
 367     CI->replaceAllUsesWith(Val);
 368     ++(*bbb);
 369     CI->eraseFromParent();
 370     return true;
 371   }
 372   Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
 373   if (!F) {
 374     return false;
 375   }
 376   if (F->getName().startswith("__atom") && !CI->getNumUses()
 377       && F->getName().find("_xchg") == StringRef::npos) {
 378     std::string buffer(F->getName().str() + "_noret");
 379     F = dyn_cast<Function>(
 380           F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
 381     atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
 382   }
 383
 384   if (!mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)
 385       && !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
 386     return false;
 387   }
 388   if (!mConvertAtomics) {
 389     return false;
 390   }
 391   StringRef name = F->getName();
 392   if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
 393     Value *ptr = CI->getOperand(0);
 394     const Value *basePtr = getBasePointerValue(ptr);
 395     const Argument *Arg = dyn_cast<Argument>(basePtr);
 396     if (Arg) {
 397       AMDILGlobalManager *GM = mSTM->getGlobalManager();
 398       int32_t id = GM->getArgID(Arg);
 399       if (id >= 0) {
 400         std::stringstream ss;
 401         ss << name.data() << "_" << id << '\n';
 402         std::string val;
 403         ss >> val;
 404         F = dyn_cast<Function>(
 405               F->getParent() ->getOrInsertFunction(val, F->getFunctionType()));
 406         atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
 407       } else {
 408         mConvertAtomics = false;
 409       }
 410     } else {
 411       mConvertAtomics = false;
 412     }
 413   }
 414   return false;
 415 }
 416
 417 bool
 418 AMDILPeepholeOpt::setupBitInsert(Instruction *base,
 419     Instruction *&src,
 420     Constant *&mask,
 421     Constant *&shift)
 422 {
 423   if (!base) {
 424     if (mDebug) {
 425       dbgs() << "Null pointer passed into function.\n";
 426     }
 427     return false;
 428   }
 429   bool andOp = false;
 430   if (base->getOpcode() == Instruction::Shl) {
 431     shift = dyn_cast<Constant>(base->getOperand(1));
 432   } else if (base->getOpcode() == Instruction::And) {
 433     mask = dyn_cast<Constant>(base->getOperand(1));
 434     andOp = true;
 435   } else {
 436     if (mDebug) {
 437       dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
 438     }
 439     // If the base is neither a Shl or a And, we don't fit any of the patterns above.
 440     return false;
 441   }
 442   src = dyn_cast<Instruction>(base->getOperand(0));
 443   if (!src) {
 444     if (mDebug) {
 445       dbgs() << "Failed setup since the base operand is not an instruction!\n";
 446     }
 447     return false;
 448   }
 449   // If we find an 'and' operation, then we don't need to
 450   // find the next operation as we already know the
 451   // bits that are valid at this point.
 452   if (andOp) {
 453     return true;
 454   }
 455   if (src->getOpcode() == Instruction::Shl && !shift) {
 456     shift = dyn_cast<Constant>(src->getOperand(1));
 457     src = dyn_cast<Instruction>(src->getOperand(0));
 458   } else if (src->getOpcode() == Instruction::And && !mask) {
 459     mask = dyn_cast<Constant>(src->getOperand(1));
 460   }
 461   if (!mask && !shift) {
 462     if (mDebug) {
 463       dbgs() << "Failed setup since both mask and shift are NULL!\n";
 464     }
 465     // Did not find a constant mask or a shift.
 466     return false;
 467   }
 468   return true;
 469 }
 470 bool
 471 AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst)
 472 {
 473   if (!inst) {
 474     return false;
 475   }
 476   if (!inst->isBinaryOp()) {
 477     return false;
 478   }
 479   if (inst->getOpcode() != Instruction::Or) {
 480     return false;
 481   }
 482   if (optLevel == CodeGenOpt::None) {
 483     return false;
 484   }
 485   // We want to do an optimization on a sequence of ops that in the end equals a
 486   // single ISA instruction.
 487   // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
 488   // Some simplified versions of this pattern are as follows:
 489   // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
 490   // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
 491   // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
 492   // (A & B) | (D << F) when (1 << F) >= B
 493   // (A << C) | (D & E) when (1 << C) >= E
 494   if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
 495     // The HD4XXX hardware doesn't support the ubit_insert instruction.
 496     return false;
 497   }
 498   Type *aType = inst->getType();
 499   bool isVector = aType->isVectorTy();
 500   int numEle = 1;
 501   // This optimization only works on 32bit integers.
 502   if (aType->getScalarType()
 503       != Type::getInt32Ty(inst->getContext())) {
 504     return false;
 505   }
 506   if (isVector) {
 507     const VectorType *VT = dyn_cast<VectorType>(aType);
 508     numEle = VT->getNumElements();
 509     // We currently cannot support more than 4 elements in a intrinsic and we
 510     // cannot support Vec3 types.
 511     if (numEle > 4 || numEle == 3) {
 512       return false;
 513     }
 514   }
 515   // TODO: Handle vectors.
 516   if (isVector) {
 517     if (mDebug) {
 518       dbgs() << "!!! Vectors are not supported yet!\n";
 519     }
 520     return false;
 521   }
 522   Instruction *LHSSrc = NULL, *RHSSrc = NULL;
 523   Constant *LHSMask = NULL, *RHSMask = NULL;
 524   Constant *LHSShift = NULL, *RHSShift = NULL;
 525   Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
 526   Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
 527   if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
 528     if (mDebug) {
 529       dbgs() << "Found an OR Operation that failed setup!\n";
 530       inst->dump();
 531       if (LHS) { LHS->dump(); }
 532       if (LHSSrc) { LHSSrc->dump(); }
 533       if (LHSMask) { LHSMask->dump(); }
 534       if (LHSShift) { LHSShift->dump(); }
 535     }
 536     // There was an issue with the setup for BitInsert.
 537     return false;
 538   }
 539   if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
 540     if (mDebug) {
 541       dbgs() << "Found an OR Operation that failed setup!\n";
 542       inst->dump();
 543       if (RHS) { RHS->dump(); }
 544       if (RHSSrc) { RHSSrc->dump(); }
 545       if (RHSMask) { RHSMask->dump(); }
 546       if (RHSShift) { RHSShift->dump(); }
 547     }
 548     // There was an issue with the setup for BitInsert.
 549     return false;
 550   }
 551   if (mDebug) {
 552     dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
 553     dbgs() << "Op:        "; inst->dump();
 554     dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
 555     dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
 556     dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
 557     dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
 558     dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
 559     dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
 560     dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
 561     dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
 562   }
 563   Constant *offset = NULL;
 564   Constant *width = NULL;
 565   int32_t lhsMaskVal = 0, rhsMaskVal = 0;
 566   int32_t lhsShiftVal = 0, rhsShiftVal = 0;
 567   int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
 568   int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
 569   lhsMaskVal = (int32_t)(LHSMask
 570       ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
 571   rhsMaskVal = (int32_t)(RHSMask
 572       ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
 573   lhsShiftVal = (int32_t)(LHSShift
 574       ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
 575   rhsShiftVal = (int32_t)(RHSShift
 576       ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
 577   lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
 578   rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
 579   lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
 580   rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
 581   // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
 582   if (mDebug) {
 583       dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
 584       dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
 585       dbgs() << (RHSMask ? " & E)" : ")");
 586       dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
 587       dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
 588       dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
 589       dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
 590       dbgs() << "width(B) = " << lhsMaskWidth;
 591       dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
 592       dbgs() << "offset(B) = " << lhsMaskOffset;
 593       dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
 594       dbgs() << "Constraints: \n";
 595       dbgs() << "\t(1) B ^ E == 0\n";
 596       dbgs() << "\t(2-LHS) B is a mask\n";
 597       dbgs() << "\t(2-LHS) E is a mask\n";
 598       dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
 599       dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
 600   }
 601   if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
 602     if (mDebug) {
 603       dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
 604       dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
 605       dbgs() << "Failed constraint 1!\n";
 606     }
 607     return false;
 608   }
 609   if (mDebug) {
 610     dbgs() << "LHS = " << lhsMaskOffset << "";
 611     dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
 612     dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
 613     dbgs() << "\nRHS = " << rhsMaskOffset << "";
 614     dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
 615     dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
 616     dbgs() << "\n";
 617   }
 618   if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
 619     offset = ConstantInt::get(aType, lhsMaskOffset, false);
 620     width = ConstantInt::get(aType, lhsMaskWidth, false);
 621     RHSSrc = RHS;
 622     if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
 623       if (mDebug) {
 624         dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
 625         dbgs() << "Failed constraint 2!\n";
 626       }
 627       return false;
 628     }
 629     if (!LHSShift) {
 630       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 631           "MaskShr", LHS);
 632     } else if (lhsShiftVal != lhsMaskOffset) {
 633       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 634           "MaskShr", LHS);
 635     }
 636     if (mDebug) {
 637       dbgs() << "Optimizing LHS!\n";
 638     }
 639   } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
 640     offset = ConstantInt::get(aType, rhsMaskOffset, false);
 641     width = ConstantInt::get(aType, rhsMaskWidth, false);
 642     LHSSrc = RHSSrc;
 643     RHSSrc = LHS;
 644     if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
 645       if (mDebug) {
 646         dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
 647         dbgs() << "Failed constraint 2!\n";
 648       }
 649       return false;
 650     }
 651     if (!RHSShift) {
 652       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 653           "MaskShr", RHS);
 654     } else if (rhsShiftVal != rhsMaskOffset) {
 655       LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
 656           "MaskShr", RHS);
 657     }
 658     if (mDebug) {
 659       dbgs() << "Optimizing RHS!\n";
 660     }
 661   } else {
 662     if (mDebug) {
 663       dbgs() << "Failed constraint 3!\n";
 664     }
 665     return false;
 666   }
 667   if (mDebug) {
 668     dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
 669     dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
 670     dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
 671     dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
 672   }
 673   if (!offset || !width) {
 674     if (mDebug) {
 675       dbgs() << "Either width or offset are NULL, failed detection!\n";
 676     }
 677     return false;
 678   }
 679   // Lets create the function signature.
 680   std::vector<Type *> callTypes;
 681   callTypes.push_back(aType);
 682   callTypes.push_back(aType);
 683   callTypes.push_back(aType);
 684   callTypes.push_back(aType);
 685   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
 686   std::string name = "__amdil_ubit_insert";
 687   if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
 688   Function *Func =
 689     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
 690         getOrInsertFunction(llvm::StringRef(name), funcType));
 691   Value *Operands[4] = {
 692     width,
 693     offset,
 694     LHSSrc,
 695     RHSSrc
 696   };
 697   CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
 698   if (mDebug) {
 699     dbgs() << "Old Inst: ";
 700     inst->dump();
 701     dbgs() << "New Inst: ";
 702     CI->dump();
 703     dbgs() << "\n\n";
 704   }
 705   CI->insertBefore(inst);
 706   inst->replaceAllUsesWith(CI);
 707   return true;
 708 }
 709
 710 bool
 711 AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst)
 712 {
 713   if (!inst) {
 714     return false;
 715   }
 716   if (!inst->isBinaryOp()) {
 717     return false;
 718   }
 719   if (inst->getOpcode() != Instruction::And) {
 720     return false;
 721   }
 722   if (optLevel == CodeGenOpt::None) {
 723     return false;
 724   }
 725   // We want to do some simple optimizations on Shift right/And patterns. The
 726   // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
 727   // value smaller than 32 and C is a mask. If C is a constant value, then the
 728   // following transformation can occur. For signed integers, it turns into the
 729   // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
 730   // integers, it turns into the function call dst =
 731   // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
 732   // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
 733   // Evergreen hardware.
 734   if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
 735     // This does not work on HD4XXX hardware.
 736     return false;
 737   }
 738   Type *aType = inst->getType();
 739   bool isVector = aType->isVectorTy();
 740   int numEle = 1;
 741   // This only works on 32bit integers
 742   if (aType->getScalarType()
 743       != Type::getInt32Ty(inst->getContext())) {
 744     return false;
 745   }
 746   if (isVector) {
 747     const VectorType *VT = dyn_cast<VectorType>(aType);
 748     numEle = VT->getNumElements();
 749     // We currently cannot support more than 4 elements in a intrinsic and we
 750     // cannot support Vec3 types.
 751     if (numEle > 4 || numEle == 3) {
 752       return false;
 753     }
 754   }
 755   BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
 756   // If the first operand is not a shift instruction, then we can return as it
 757   // doesn't match this pattern.
 758   if (!ShiftInst || !ShiftInst->isShift()) {
 759     return false;
 760   }
 761   // If we are a shift left, then we need don't match this pattern.
 762   if (ShiftInst->getOpcode() == Instruction::Shl) {
 763     return false;
 764   }
 765   bool isSigned = ShiftInst->isArithmeticShift();
 766   Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
 767   Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
 768   // Lets make sure that the shift value and the and mask are constant integers.
 769   if (!AndMask || !ShrVal) {
 770     return false;
 771   }
 772   Constant *newMaskConst;
 773   Constant *shiftValConst;
 774   if (isVector) {
 775     // Handle the vector case
 776     std::vector<Constant *> maskVals;
 777     std::vector<Constant *> shiftVals;
 778     ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
 779     ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
 780     Type *scalarType = AndMaskVec->getType()->getScalarType();
 781     assert(AndMaskVec->getNumOperands() ==
 782            ShrValVec->getNumOperands() && "cannot have a "
 783            "combination where the number of elements to a "
 784            "shift and an and are different!");
 785     for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
 786       ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
 787       ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
 788       if (!AndCI || !ShiftIC) {
 789         return false;
 790       }
 791       uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
 792       if (!isMask_32(maskVal)) {
 793         return false;
 794       }
 795       maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
 796       uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
 797       // If the mask or shiftval is greater than the bitcount, then break out.
 798       if (maskVal >= 32 || shiftVal >= 32) {
 799         return false;
 800       }
 801       // If the mask val is greater than the the number of original bits left
 802       // then this optimization is invalid.
 803       if (maskVal > (32 - shiftVal)) {
 804         return false;
 805       }
 806       maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
 807       shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
 808     }
 809     newMaskConst = ConstantVector::get(maskVals);
 810     shiftValConst = ConstantVector::get(shiftVals);
 811   } else {
 812     // Handle the scalar case
 813     uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
 814     // This must be a mask value where all lower bits are set to 1 and then any
 815     // bit higher is set to 0.
 816     if (!isMask_32(maskVal)) {
 817       return false;
 818     }
 819     maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
 820     // Count the number of bits set in the mask, this is the width of the
 821     // resulting bit set that is extracted from the source value.
 822     uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
 823     // If the mask or shift val is greater than the bitcount, then break out.
 824     if (maskVal >= 32 || shiftVal >= 32) {
 825       return false;
 826     }
 827     // If the mask val is greater than the the number of original bits left then
 828     // this optimization is invalid.
 829     if (maskVal > (32 - shiftVal)) {
 830       return false;
 831     }
 832     newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
 833     shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
 834   }
 835   // Lets create the function signature.
 836   std::vector<Type *> callTypes;
 837   callTypes.push_back(aType);
 838   callTypes.push_back(aType);
 839   callTypes.push_back(aType);
 840   FunctionType *funcType = FunctionType::get(aType, callTypes, false);
 841   std::string name = "__amdil_ubit_extract";
 842   if (isVector) {
 843     name += "_v" + itostr(numEle) + "i32";
 844   } else {
 845     name += "_i32";
 846   }
 847   // Lets create the function.
 848   Function *Func =
 849     dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
 850                        getOrInsertFunction(llvm::StringRef(name), funcType));
 851   Value *Operands[3] = {
 852     newMaskConst,
 853     shiftValConst,
 854     ShiftInst->getOperand(0)
 855   };
 856   // Lets create the Call with the operands
 857   CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
 858   CI->insertBefore(inst);
 859   inst->replaceAllUsesWith(CI);
 860   return true;
 861 }
 862
 863 bool
 864 AMDILPeepholeOpt::expandBFI(CallInst *CI)
 865 {
 866   if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
 867     return false;
 868   }
 869   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
 870   if (!LHS->getName().startswith("__amdil_bfi")) {
 871     return false;
 872   }
 873   Type* type = CI->getOperand(0)->getType();
 874   Constant *negOneConst = NULL;
 875   if (type->isVectorTy()) {
 876     std::vector<Constant *> negOneVals;
 877     negOneConst = ConstantInt::get(CI->getContext(),
 878         APInt(32, StringRef("-1"), 10));
 879     for (size_t x = 0,
 880         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
 881       negOneVals.push_back(negOneConst);
 882     }
 883     negOneConst = ConstantVector::get(negOneVals);
 884   } else {
 885     negOneConst = ConstantInt::get(CI->getContext(),
 886         APInt(32, StringRef("-1"), 10));
 887   }
 888   // __amdil_bfi => (A & B) | (~A & C)
 889   BinaryOperator *lhs =
 890     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
 891         CI->getOperand(1), "bfi_and", CI);
 892   BinaryOperator *rhs =
 893     BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
 894         "bfi_not", CI);
 895   rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
 896       "bfi_and", CI);
 897   lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
 898   CI->replaceAllUsesWith(lhs);
 899   return true;
 900 }
 901
 902 bool
 903 AMDILPeepholeOpt::expandBFM(CallInst *CI)
 904 {
 905   if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
 906     return false;
 907   }
 908   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
 909   if (!LHS->getName().startswith("__amdil_bfm")) {
 910     return false;
 911   }
 912   // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
 913   Constant *newMaskConst = NULL;
 914   Constant *newShiftConst = NULL;
 915   Type* type = CI->getOperand(0)->getType();
 916   if (type->isVectorTy()) {
 917     std::vector<Constant*> newMaskVals, newShiftVals;
 918     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
 919     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
 920     for (size_t x = 0,
 921         y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
 922       newMaskVals.push_back(newMaskConst);
 923       newShiftVals.push_back(newShiftConst);
 924     }
 925     newMaskConst = ConstantVector::get(newMaskVals);
 926     newShiftConst = ConstantVector::get(newShiftVals);
 927   } else {
 928     newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
 929     newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
 930   }
 931   BinaryOperator *lhs =
 932     BinaryOperator::Create(Instruction::And, CI->getOperand(0),
 933         newMaskConst, "bfm_mask", CI);
 934   lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
 935       lhs, "bfm_shl", CI);
 936   lhs = BinaryOperator::Create(Instruction::Sub, lhs,
 937       newShiftConst, "bfm_sub", CI);
 938   BinaryOperator *rhs =
 939     BinaryOperator::Create(Instruction::And, CI->getOperand(1),
 940         newMaskConst, "bfm_mask", CI);
 941   lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
 942   CI->replaceAllUsesWith(lhs);
 943   return true;
 944 }
 945
 946 bool
 947 AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
 948 {
 949   Instruction *inst = (*bbb);
 950   if (optimizeCallInst(bbb)) {
 951     return true;
 952   }
 953   if (optimizeBitExtract(inst)) {
 954     return false;
 955   }
 956   if (optimizeBitInsert(inst)) {
 957     return false;
 958   }
 959   if (correctMisalignedMemOp(inst)) {
 960     return false;
 961   }
 962   return false;
 963 }
 964 bool
 965 AMDILPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
 966 {
 967   LoadInst *linst = dyn_cast<LoadInst>(inst);
 968   StoreInst *sinst = dyn_cast<StoreInst>(inst);
 969   unsigned alignment;
 970   Type* Ty = inst->getType();
 971   if (linst) {
 972     alignment = linst->getAlignment();
 973     Ty = inst->getType();
 974   } else if (sinst) {
 975     alignment = sinst->getAlignment();
 976     Ty = sinst->getValueOperand()->getType();
 977   } else {
 978     return false;
 979   }
 980   unsigned size = getTypeSize(Ty);
 981   if (size == alignment || size < alignment) {
 982     return false;
 983   }
 984   if (!Ty->isStructTy()) {
 985     return false;
 986   }
 987   if (alignment < 4) {
 988     if (linst) {
 989       linst->setAlignment(0);
 990       return true;
 991     } else if (sinst) {
 992       sinst->setAlignment(0);
 993       return true;
 994     }
 995   }
 996   return false;
 997 }
 998 bool
 999 AMDILPeepholeOpt::isSigned24BitOps(CallInst *CI)
1000 {
1001   if (!CI) {
1002     return false;
1003   }
1004   Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
1005   std::string namePrefix = LHS->getName().substr(0, 14);
1006   if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
1007       && namePrefix != "__amdil__imul24_high") {
1008     return false;
1009   }
1010   if (mSTM->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) {
1011     return false;
1012   }
1013   return true;
1014 }
1015
1016 void
1017 AMDILPeepholeOpt::expandSigned24BitOps(CallInst *CI)
1018 {
1019   assert(isSigned24BitOps(CI) && "Must be a "
1020       "signed 24 bit operation to call this function!");
1021   Value *LHS = CI->getOperand(CI->getNumOperands()-1);
1022   // On 7XX and 8XX we do not have signed 24bit, so we need to
1023   // expand it to the following:
1024   // imul24 turns into 32bit imul
1025   // imad24 turns into 32bit imad
1026   // imul24_high turns into 32bit imulhigh
1027   if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
1028     Type *aType = CI->getOperand(0)->getType();
1029     bool isVector = aType->isVectorTy();
1030     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
1031     std::vector<Type*> callTypes;
1032     callTypes.push_back(CI->getOperand(0)->getType());
1033     callTypes.push_back(CI->getOperand(1)->getType());
1034     callTypes.push_back(CI->getOperand(2)->getType());
1035     FunctionType *funcType =
1036       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
1037     std::string name = "__amdil_imad";
1038     if (isVector) {
1039       name += "_v" + itostr(numEle) + "i32";
1040     } else {
1041       name += "_i32";
1042     }
1043     Function *Func = dyn_cast<Function>(
1044                        CI->getParent()->getParent()->getParent()->
1045                        getOrInsertFunction(llvm::StringRef(name), funcType));
1046     Value *Operands[3] = {
1047       CI->getOperand(0),
1048       CI->getOperand(1),
1049       CI->getOperand(2)
1050     };
1051     CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
1052     nCI->insertBefore(CI);
1053     CI->replaceAllUsesWith(nCI);
1054   } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
1055     BinaryOperator *mulOp =
1056       BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
1057           CI->getOperand(1), "imul24", CI);
1058     CI->replaceAllUsesWith(mulOp);
1059   } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
1060     Type *aType = CI->getOperand(0)->getType();
1061
1062     bool isVector = aType->isVectorTy();
1063     int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
1064     std::vector<Type*> callTypes;
1065     callTypes.push_back(CI->getOperand(0)->getType());
1066     callTypes.push_back(CI->getOperand(1)->getType());
1067     FunctionType *funcType =
1068       FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
1069     std::string name = "__amdil_imul_high";
1070     if (isVector) {
1071       name += "_v" + itostr(numEle) + "i32";
1072     } else {
1073       name += "_i32";
1074     }
1075     Function *Func = dyn_cast<Function>(
1076                        CI->getParent()->getParent()->getParent()->
1077                        getOrInsertFunction(llvm::StringRef(name), funcType));
1078     Value *Operands[2] = {
1079       CI->getOperand(0),
1080       CI->getOperand(1)
1081     };
1082     CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
1083     nCI->insertBefore(CI);
1084     CI->replaceAllUsesWith(nCI);
1085   }
1086 }
1087
1088 bool
1089 AMDILPeepholeOpt::isRWGLocalOpt(CallInst *CI)
1090 {
1091   return (CI != NULL && mRWGOpt
1092           && CI->getOperand(CI->getNumOperands() - 1)->getName()
1093           == "__amdil_get_local_size_int");
1094 }
1095
1096 void
1097 AMDILPeepholeOpt::expandRWGLocalOpt(CallInst *CI)
1098 {
1099   assert(isRWGLocalOpt(CI) &&
1100          "This optmization only works when the call inst is get_local_size!");
1101   std::vector<Constant *> consts;
1102   for (uint32_t x = 0; x < 3; ++x) {
1103     uint32_t val = mSTM->getGlobalManager()->getLocal(mF->getName(), x);
1104     consts.push_back(ConstantInt::get(Type::getInt32Ty(*mCTX), val));
1105   }
1106   consts.push_back(ConstantInt::get(Type::getInt32Ty(*mCTX), 0));
1107   Value *cVec = ConstantVector::get(consts);
1108   CI->replaceAllUsesWith(cVec);
1109   ++LocalFuncs;
1110   return;
1111 }
1112
1113 bool
1114 AMDILPeepholeOpt::convertAccurateDivide(CallInst *CI)
1115 {
1116   if (!CI) {
1117     return false;
1118   }
1119   if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX
1120       && (mSTM->getDeviceName() == "cayman")) {
1121     return false;
1122   }
1123   return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
1124       == "__amdil_improved_div";
1125 }
1126
1127 void
1128 AMDILPeepholeOpt::expandAccurateDivide(CallInst *CI)
1129 {
1130   assert(convertAccurateDivide(CI)
1131          && "expanding accurate divide can only happen if it is expandable!");
1132   BinaryOperator *divOp =
1133     BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
1134                            CI->getOperand(1), "fdiv32", CI);
1135   CI->replaceAllUsesWith(divOp);
1136 }
1137
1138 bool
1139 AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI)
1140 {
1141   if (optLevel != CodeGenOpt::None) {
1142     return false;
1143   }
1144
1145   if (!CI) {
1146     return false;
1147   }
1148
1149   unsigned funcNameIdx = 0;
1150   funcNameIdx = CI->getNumOperands() - 1;
1151   StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
1152   if (calleeName != "__amdil_image2d_read_norm"
1153    && calleeName != "__amdil_image2d_read_unnorm"
1154    && calleeName != "__amdil_image3d_read_norm"
1155    && calleeName != "__amdil_image3d_read_unnorm") {
1156     return false;
1157   }
1158
1159   unsigned samplerIdx = 2;
1160   samplerIdx = 1;
1161   Value *sampler = CI->getOperand(samplerIdx);
1162   LoadInst *lInst = dyn_cast<LoadInst>(sampler);
1163   if (!lInst) {
1164     return false;
1165   }
1166
1167   if (lInst->getPointerAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
1168     return false;
1169   }
1170
1171   GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
1172   // If we are loading from what is not a global value, then we
1173   // fail and return.
1174   if (!gv) {
1175     return false;
1176   }
1177
1178   // If we don't have an initializer or we have an initializer and
1179   // the initializer is not a 32bit integer, we fail.
1180   if (!gv->hasInitializer()
1181       || !gv->getInitializer()->getType()->isIntegerTy(32)) {
1182       return false;
1183   }
1184
1185   // Now that we have the global variable initializer, lets replace
1186   // all uses of the load instruction with the samplerVal and
1187   // reparse the __amdil_is_constant() function.
1188   Constant *samplerVal = gv->getInitializer();
1189   lInst->replaceAllUsesWith(samplerVal);
1190   return true;
1191 }
1192
1193 bool
1194 AMDILPeepholeOpt::doInitialization(Module &M)
1195 {
1196   return false;
1197 }
1198
1199 bool
1200 AMDILPeepholeOpt::doFinalization(Module &M)
1201 {
1202   return false;
1203 }
1204
1205 void
1206 AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
1207 {
1208   AU.addRequired<MachineFunctionAnalysis>();
1209   FunctionPass::getAnalysisUsage(AU);
1210   AU.setPreservesAll();
1211 }