//===-- AMDILPeepholeOptimizer.cpp - AMDIL peephole optimizations --------===//
// The LLVM Compiler Infrastructure
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "PeepholeOpt"
#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
#include "AMDILAlgorithms.tpp"
#include "AMDILDevices.h"
#include "AMDILGlobalManager.h"
#include "AMDILKernelManager.h"
#include "AMDILMachineFunctionInfo.h"
#include "AMDILUtilityFunctions.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
STATISTIC(PointerAssignments, "Number of dynamic pointer "
"assignments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
STATISTIC(LocalFuncs, "Number of get_local_size(N) functions removed");
// The Peephole optimization pass is used to do simple last-minute
// optimizations that are required for correct code or to remove redundant
// functions.
class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public FunctionPass {
AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL);
const char *getPassName() const;
bool runOnFunction(Function &F);
bool doInitialization(Module &M);
bool doFinalization(Module &M);
void getAnalysisUsage(AnalysisUsage &AU) const;
// Function to initiate all of the instruction-level optimizations.
bool instLevelOptimizations(BasicBlock::iterator *inst);
// Quick check to see if we need to dump all of the pointers into the
// arena. If so, then we set all pointers to exist in the arena. This
// is a workaround for aliasing of pointers in a struct/union.
bool dumpAllIntoArena(Function &F);
// Because we don't want to invalidate any pointers while in the
// safeNestedForEach function, we push atomic conversions to a vector and
// handle them later. This function does the conversions if required.
void doAtomicConversionIfNeeded(Function &F);
// Because __amdil_is_constant cannot be properly evaluated if
// optimizations are disabled, the calls are placed in a vector
// and evaluated after the __amdil_image* functions are evaluated,
// which should allow the __amdil_is_constant function to be
// evaluated correctly.
void doIsConstCallConversionIfNeeded();
CodeGenOpt::Level optLevel;
// Run a series of tests to see if we can optimize a CALL instruction.
bool optimizeCallInst(BasicBlock::iterator *bbb);
// A peephole optimization to optimize bit extract sequences.
bool optimizeBitExtract(Instruction *inst);
// A peephole optimization to optimize bit insert sequences.
bool optimizeBitInsert(Instruction *inst);
bool setupBitInsert(Instruction *base,
// Expand the bit field insert instruction on versions of OpenCL that
// do not support it natively.
bool expandBFI(CallInst *CI);
// Expand the bit field mask instruction on versions of OpenCL that
// do not support it natively.
bool expandBFM(CallInst *CI);
// On 7XX and 8XX hardware, we do not have 24-bit signed operations, so in
// this case we need to expand them. These functions check for 24-bit
// functions and expand them accordingly.
bool isSigned24BitOps(CallInst *CI);
void expandSigned24BitOps(CallInst *CI);
// One optimization that can occur is that if the required workgroup size is
// specified, then the result of get_local_size is known at compile time and
// can be returned accordingly.
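// For example (illustrative): a kernel declared with
// __attribute__((reqd_work_group_size(64, 1, 1))) lets get_local_size(0)
// fold to the constant 64 at compile time.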
bool isRWGLocalOpt(CallInst *CI);
void expandRWGLocalOpt(CallInst *CI);
// On Northern Islands cards, division is slightly less accurate than on
// previous generations, so we need to use a more accurate division there; on
// all other cards we can translate the accurate divide into a normal divide.
bool convertAccurateDivide(CallInst *CI);
void expandAccurateDivide(CallInst *CI);
// If the alignment is set incorrectly, it can produce really inefficient
// code. This checks for this scenario and fixes it if possible.
bool correctMisalignedMemOp(Instruction *inst);
// If we are in no-opt mode, then we need to make sure that
// local samplers are properly propagated, as constant propagation
// doesn't occur, and we need to know the value of kernel-defined
// samplers at compile time.
bool propagateSamplerInst(CallInst *CI);
const AMDILSubtarget *mSTM;
SmallVector<std::pair<CallInst *, Function *>, 16> atomicFuncs;
SmallVector<CallInst *, 16> isConstVec;
}; // class AMDILPeepholeOpt
char AMDILPeepholeOpt::ID = 0;
} // anonymous namespace
createAMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
return new AMDILPeepholeOpt(tm AMDIL_OPT_LEVEL_VAR);
AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
: FunctionPass(ID), TM(tm)
optLevel = TM.getOptLevel();
AMDILPeepholeOpt::~AMDILPeepholeOpt()
AMDILPeepholeOpt::getPassName() const
return "AMDIL Peephole Optimization Pass";
containsPointerType(Type *Ty)
switch(Ty->getTypeID()) {
case Type::StructTyID: {
const StructType *ST = dyn_cast<StructType>(Ty);
for (StructType::element_iterator stb = ST->element_begin(),
ste = ST->element_end(); stb != ste; ++stb) {
if (!containsPointerType(*stb)) {
case Type::VectorTyID:
case Type::ArrayTyID:
return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
case Type::PointerTyID:
AMDILPeepholeOpt::dumpAllIntoArena(Function &F)
bool dumpAll = false;
for (Function::const_arg_iterator cab = F.arg_begin(),
cae = F.arg_end(); cab != cae; ++cab) {
const Argument *arg = cab;
const PointerType *PT = dyn_cast<PointerType>(arg->getType());
Type *DereferencedType = PT->getElementType();
if (!dyn_cast<StructType>(DereferencedType)
if (!containsPointerType(DereferencedType)) {
// FIXME: Because a pointer inside of a struct/union may be aliased to
// another pointer, we need to take the conservative approach and place all
// pointers into the arena until more advanced detection is implemented.
AMDILPeepholeOpt::doIsConstCallConversionIfNeeded()
if (isConstVec.empty()) {
for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
CallInst *CI = isConstVec[x];
Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
Type *aType = Type::getInt32Ty(*mCTX);
Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
: ConstantInt::get(aType, 0);
CI->replaceAllUsesWith(Val);
CI->eraseFromParent();
AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function &F)
// Don't do anything if we don't have any atomic operations.
if (atomicFuncs.empty()) {
// Change the function name for the atomic if it is required.
uint32_t size = atomicFuncs.size();
for (uint32_t x = 0; x < size; ++x) {
atomicFuncs[x].first->setOperand(
atomicFuncs[x].first->getNumOperands()-1,
atomicFuncs[x].second);
if (mConvertAtomics) {
// If we did not convert all of the atomics, then we need to make sure that
// the atomics that were not converted have their base pointers set to use the
// arena.
Function::arg_iterator argB = F.arg_begin();
Function::arg_iterator argE = F.arg_end();
AMDILKernelManager *KM = mSTM->getKernelManager();
AMDILMachineFunctionInfo *mMFI = getAnalysis<MachineFunctionAnalysis>().getMF()
.getInfo<AMDILMachineFunctionInfo>();
for (; argB != argE; ++argB) {
if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaUAV)) {
KM->setUAVID(argB, mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID));
mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID));
KM->setUAVID(argB, mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
AMDILPeepholeOpt::runOnFunction(Function &MF)
mSTM = &TM.getSubtarget<AMDILSubtarget>();
mCTX = &MF.getType()->getContext();
mConvertAtomics = true;
if (dumpAllIntoArena(MF)) {
for (Function::const_arg_iterator cab = MF.arg_begin(),
cae = MF.arg_end(); cab != cae; ++cab) {
const Argument *arg = cab;
AMDILKernelManager *KM = mSTM->getKernelManager();
KM->setUAVID(getBasePointerValue(arg),
mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID));
mRWGOpt = mSTM->getGlobalManager()->hasRWG(MF.getName());
safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
std::bind1st(std::mem_fun(&AMDILPeepholeOpt::instLevelOptimizations),
doAtomicConversionIfNeeded(MF);
doIsConstCallConversionIfNeeded();
AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb)
Instruction *inst = (*bbb);
CallInst *CI = dyn_cast<CallInst>(inst);
if (isSigned24BitOps(CI)) {
expandSigned24BitOps(CI);
CI->eraseFromParent();
if (isRWGLocalOpt(CI)) {
expandRWGLocalOpt(CI);
if (propagateSamplerInst(CI)) {
if (expandBFI(CI) || expandBFM(CI)) {
CI->eraseFromParent();
if (convertAccurateDivide(CI)) {
expandAccurateDivide(CI);
CI->eraseFromParent();
StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
if (calleeName.startswith("__amdil_is_constant")) {
// If we do not have optimizations, then this
// cannot be properly evaluated, so we add the
// call instruction to a vector and process
// them at the end of processing after the
// samplers have been correctly handled.
if (optLevel == CodeGenOpt::None) {
isConstVec.push_back(CI);
Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
Type *aType = Type::getInt32Ty(*mCTX);
Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
: ConstantInt::get(aType, 0);
CI->replaceAllUsesWith(Val);
CI->eraseFromParent();
if (calleeName.equals("__amdil_is_asic_id_i32")) {
ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
Type *aType = Type::getInt32Ty(*mCTX);
Val = ConstantInt::get(aType,
mSTM->device()->getDeviceFlag() & CV->getZExtValue());
Val = ConstantInt::get(aType, 0);
CI->replaceAllUsesWith(Val);
CI->eraseFromParent();
Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
if (F->getName().startswith("__atom") && !CI->getNumUses()
&& F->getName().find("_xchg") == StringRef::npos) {
std::string buffer(F->getName().str() + "_noret");
F = dyn_cast<Function>(
F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
atomicFuncs.push_back(std::make_pair<CallInst*, Function*>(CI, F));
if (!mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)
&& !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) {
if (!mConvertAtomics) {
StringRef name = F->getName();
if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
Value *ptr = CI->getOperand(0);
const Value *basePtr = getBasePointerValue(ptr);
const Argument *Arg = dyn_cast<Argument>(basePtr);
AMDILGlobalManager *GM = mSTM->getGlobalManager();
int32_t id = GM->getArgID(Arg);
std::stringstream ss;
ss << name.data() << "_" << id << '\n';
F = dyn_cast<Function>(
F->getParent()->getOrInsertFunction(val, F->getFunctionType()));
atomicFuncs.push_back(std::make_pair<CallInst*, Function*>(CI, F));
mConvertAtomics = false;
mConvertAtomics = false;
AMDILPeepholeOpt::setupBitInsert(Instruction *base,
dbgs() << "Null pointer passed into function.\n";
if (base->getOpcode() == Instruction::Shl) {
shift = dyn_cast<Constant>(base->getOperand(1));
} else if (base->getOpcode() == Instruction::And) {
mask = dyn_cast<Constant>(base->getOperand(1));
dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
// If the base is neither a Shl nor an And, we don't fit any of the patterns above.
src = dyn_cast<Instruction>(base->getOperand(0));
dbgs() << "Failed setup since the base operand is not an instruction!\n";
// If we find an 'and' operation, then we don't need to
// find the next operation, as we already know the
// bits that are valid at this point.
if (src->getOpcode() == Instruction::Shl && !shift) {
shift = dyn_cast<Constant>(src->getOperand(1));
src = dyn_cast<Instruction>(src->getOperand(0));
} else if (src->getOpcode() == Instruction::And && !mask) {
mask = dyn_cast<Constant>(src->getOperand(1));
if (!mask && !shift) {
dbgs() << "Failed setup since both mask and shift are NULL!\n";
// Did not find a constant mask or a shift.
AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst)
if (!inst->isBinaryOp()) {
if (inst->getOpcode() != Instruction::Or) {
if (optLevel == CodeGenOpt::None) {
// We want to do an optimization on a sequence of ops that in the end equals a
// single ISA instruction.
// The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
// Some simplified versions of this pattern are as follows:
// (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
// ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
// (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
// (A & B) | (D << F) when (1 << F) >= B
// (A << C) | (D & E) when (1 << C) >= E
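// A concrete instance (illustrative): (A & 0x0000FFFF) | ((D << 16) & 0xFFFF0000)
// packs the low 16 bits of D above the low 16 bits of A, which corresponds to
// a single ubit_insert of width 16 at offset 16.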
if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
// The HD4XXX hardware doesn't support the ubit_insert instruction.
Type *aType = inst->getType();
bool isVector = aType->isVectorTy();
// This optimization only works on 32-bit integers.
if (aType->getScalarType()
!= Type::getInt32Ty(inst->getContext())) {
const VectorType *VT = dyn_cast<VectorType>(aType);
numEle = VT->getNumElements();
// We currently cannot support more than 4 elements in an intrinsic and we
// cannot support Vec3 types.
if (numEle > 4 || numEle == 3) {
// TODO: Handle vectors.
dbgs() << "!!! Vectors are not supported yet!\n";
Instruction *LHSSrc = NULL, *RHSSrc = NULL;
Constant *LHSMask = NULL, *RHSMask = NULL;
Constant *LHSShift = NULL, *RHSShift = NULL;
Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
dbgs() << "Found an OR operation that failed setup!\n";
if (LHS) { LHS->dump(); }
if (LHSSrc) { LHSSrc->dump(); }
if (LHSMask) { LHSMask->dump(); }
if (LHSShift) { LHSShift->dump(); }
// There was an issue with the setup for BitInsert.
if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
dbgs() << "Found an OR operation that failed setup!\n";
if (RHS) { RHS->dump(); }
if (RHSSrc) { RHSSrc->dump(); }
if (RHSMask) { RHSMask->dump(); }
if (RHSShift) { RHSShift->dump(); }
// There was an issue with the setup for BitInsert.
552 dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
553 dbgs() << "Op: "; inst->dump();
554 dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
555 dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
556 dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
557 dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
558 dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
559 dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
560 dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
561 dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
Constant *offset = NULL;
Constant *width = NULL;
int32_t lhsMaskVal = 0, rhsMaskVal = 0;
int32_t lhsShiftVal = 0, rhsShiftVal = 0;
int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
lhsMaskVal = (int32_t)(LHSMask
? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
rhsMaskVal = (int32_t)(RHSMask
? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
lhsShiftVal = (int32_t)(LHSShift
? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
rhsShiftVal = (int32_t)(RHSShift
? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
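// For example (illustrative): a mask of 0x00FF0000 gives
// CountPopulation_32 = 8 (the field width) and CountTrailingZeros_32 = 16
// (the field offset).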
// TODO: Handle the case of A & B | D & ~B (i.e. inverted masks).
dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
dbgs() << (LHSShift ? " << C)" : ")") << " | ((D";
dbgs() << (RHSMask ? " & E)" : ")");
dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
dbgs() << "width(B) = " << lhsMaskWidth;
dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
dbgs() << "offset(B) = " << lhsMaskOffset;
dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
dbgs() << "Constraints: \n";
dbgs() << "\t(1) B ^ E == 0\n";
dbgs() << "\t(2-LHS) B is a mask\n";
dbgs() << "\t(2-RHS) E is a mask\n";
dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
dbgs() << "Failed constraint 1!\n";
dbgs() << "LHS = " << lhsMaskOffset;
dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
dbgs() << "\nRHS = " << rhsMaskOffset;
dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
offset = ConstantInt::get(aType, lhsMaskOffset, false);
width = ConstantInt::get(aType, lhsMaskWidth, false);
if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
dbgs() << "Failed constraint 2!\n";
LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
} else if (lhsShiftVal != lhsMaskOffset) {
LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
dbgs() << "Optimizing LHS!\n";
} else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
offset = ConstantInt::get(aType, rhsMaskOffset, false);
width = ConstantInt::get(aType, rhsMaskWidth, false);
if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
dbgs() << "Failed constraint 2!\n";
RHSSrc = BinaryOperator::Create(Instruction::LShr, RHSSrc, offset,
} else if (rhsShiftVal != rhsMaskOffset) {
RHSSrc = BinaryOperator::Create(Instruction::LShr, RHSSrc, offset,
dbgs() << "Optimizing RHS!\n";
dbgs() << "Failed constraint 3!\n";
668 dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
669 dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
670 dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
671 dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
673 if (!offset || !width) {
675 dbgs() << "Either width or offset are NULL, failed detection!\n";
679 // Lets create the function signature.
680 std::vector<Type *> callTypes;
681 callTypes.push_back(aType);
682 callTypes.push_back(aType);
683 callTypes.push_back(aType);
684 callTypes.push_back(aType);
685 FunctionType *funcType = FunctionType::get(aType, callTypes, false);
686 std::string name = "__amdil_ubit_insert";
687 if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
689 dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
690 getOrInsertFunction(llvm::StringRef(name), funcType));
691 Value *Operands[4] = {
697 CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
699 dbgs() << "Old Inst: ";
701 dbgs() << "New Inst: ";
705 CI->insertBefore(inst);
706 inst->replaceAllUsesWith(CI);
AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst)
if (!inst->isBinaryOp()) {
if (inst->getOpcode() != Instruction::And) {
if (optLevel == CodeGenOpt::None) {
// We want to do some simple optimizations on shift-right/and patterns. The
// basic optimization is to turn (A >> B) & C, where A is a 32-bit type, B is
// a value smaller than 32, and C is a mask. If C is a constant value, then
// the following transformation can occur: for signed integers it turns into
// the function call dst = __amdil_ibit_extract(log2(C + 1), B, A); for
// unsigned integers it turns into dst = __amdil_ubit_extract(log2(C + 1), B, A).
// The function __amdil_[u|i]bit_extract can be found in Section 7.9 of the
// ATI IL spec of the stream SDK for Evergreen hardware.
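// For example (illustrative): '(A >> 8) & 0xFF' extracts the 8-bit field at
// offset 8, so it becomes __amdil_ubit_extract(8, 8, A) (width, offset, src).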
if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) {
// This does not work on HD4XXX hardware.
Type *aType = inst->getType();
bool isVector = aType->isVectorTy();
// This only works on 32-bit integers.
if (aType->getScalarType()
!= Type::getInt32Ty(inst->getContext())) {
const VectorType *VT = dyn_cast<VectorType>(aType);
numEle = VT->getNumElements();
// We currently cannot support more than 4 elements in an intrinsic and we
// cannot support Vec3 types.
if (numEle > 4 || numEle == 3) {
BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
// If the first operand is not a shift instruction, then we can return, as it
// doesn't match this pattern.
if (!ShiftInst || !ShiftInst->isShift()) {
// If it is a shift left, then we don't match this pattern.
if (ShiftInst->getOpcode() == Instruction::Shl) {
bool isSigned = ShiftInst->isArithmeticShift();
Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
// Let's make sure that the shift value and the and-mask are constant integers.
if (!AndMask || !ShrVal) {
Constant *newMaskConst;
Constant *shiftValConst;
// Handle the vector case.
std::vector<Constant *> maskVals;
std::vector<Constant *> shiftVals;
ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
Type *scalarType = AndMaskVec->getType()->getScalarType();
assert(AndMaskVec->getNumOperands() ==
ShrValVec->getNumOperands() && "cannot have a "
"combination where the number of elements to a "
"shift and an and are different!");
for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
if (!AndCI || !ShiftIC) {
uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
if (!isMask_32(maskVal)) {
maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
// If the mask or shift val is greater than the bitcount, then break out.
if (maskVal >= 32 || shiftVal >= 32) {
// If the mask val is greater than the number of original bits left,
// then this optimization is invalid.
if (maskVal > (32 - shiftVal)) {
maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
newMaskConst = ConstantVector::get(maskVals);
shiftValConst = ConstantVector::get(shiftVals);
// Handle the scalar case.
uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
// This must be a mask value where all lower bits are set to 1 and any
// higher bit is set to 0.
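// For example (illustrative): 0x000000FF and 0x0000FFFF are such masks,
// while 0x00FF00FF is not.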
if (!isMask_32(maskVal)) {
maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
// Count the number of bits set in the mask; this is the width of the
// resulting bit set that is extracted from the source value.
uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
// If the mask or shift val is greater than the bitcount, then break out.
if (maskVal >= 32 || shiftVal >= 32) {
// If the mask val is greater than the number of original bits left, then
// this optimization is invalid.
if (maskVal > (32 - shiftVal)) {
newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
// Let's create the function signature.
std::vector<Type *> callTypes;
callTypes.push_back(aType);
callTypes.push_back(aType);
callTypes.push_back(aType);
FunctionType *funcType = FunctionType::get(aType, callTypes, false);
std::string name = "__amdil_ubit_extract";
name += "_v" + itostr(numEle) + "i32";
// Let's create the function.
dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
getOrInsertFunction(llvm::StringRef(name), funcType));
Value *Operands[3] = {
ShiftInst->getOperand(0)
// Let's create the Call with the operands.
CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
CI->insertBefore(inst);
inst->replaceAllUsesWith(CI);
AMDILPeepholeOpt::expandBFI(CallInst *CI)
if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
if (!LHS->getName().startswith("__amdil_bfi")) {
Type* type = CI->getOperand(0)->getType();
Constant *negOneConst = NULL;
if (type->isVectorTy()) {
std::vector<Constant *> negOneVals;
negOneConst = ConstantInt::get(CI->getContext(),
APInt(32, StringRef("-1"), 10));
y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
negOneVals.push_back(negOneConst);
negOneConst = ConstantVector::get(negOneVals);
negOneConst = ConstantInt::get(CI->getContext(),
APInt(32, StringRef("-1"), 10));
// __amdil_bfi => (A & B) | (~A & C)
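// That is, where a bit of A is 1 the result takes the corresponding bit of B,
// otherwise the bit of C. For example (illustrative): bfi(0x0000FFFF, B, C)
// merges the low half of B with the high half of C.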
BinaryOperator *lhs =
BinaryOperator::Create(Instruction::And, CI->getOperand(0),
CI->getOperand(1), "bfi_and", CI);
BinaryOperator *rhs =
BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
CI->replaceAllUsesWith(lhs);
AMDILPeepholeOpt::expandBFM(CallInst *CI)
if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) {
Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
if (!LHS->getName().startswith("__amdil_bfm")) {
// __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1F)
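// For example (illustrative): src0 = 8 and src1 = 16 give
// ((1 << 8) - 1) << 16 = 0x00FF0000, an 8-bit mask at offset 16.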
Constant *newMaskConst = NULL;
Constant *newShiftConst = NULL;
Type* type = CI->getOperand(0)->getType();
if (type->isVectorTy()) {
std::vector<Constant*> newMaskVals, newShiftVals;
newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
newMaskVals.push_back(newMaskConst);
newShiftVals.push_back(newShiftConst);
newMaskConst = ConstantVector::get(newMaskVals);
newShiftConst = ConstantVector::get(newShiftVals);
newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
BinaryOperator *lhs =
BinaryOperator::Create(Instruction::And, CI->getOperand(0),
newMaskConst, "bfm_mask", CI);
lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
lhs = BinaryOperator::Create(Instruction::Sub, lhs,
newShiftConst, "bfm_sub", CI);
BinaryOperator *rhs =
BinaryOperator::Create(Instruction::And, CI->getOperand(1),
newMaskConst, "bfm_mask", CI);
lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
CI->replaceAllUsesWith(lhs);
AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb)
Instruction *inst = (*bbb);
if (optimizeCallInst(bbb)) {
if (optimizeBitExtract(inst)) {
if (optimizeBitInsert(inst)) {
if (correctMisalignedMemOp(inst)) {
AMDILPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
LoadInst *linst = dyn_cast<LoadInst>(inst);
StoreInst *sinst = dyn_cast<StoreInst>(inst);
Type* Ty = inst->getType();
alignment = linst->getAlignment();
Ty = inst->getType();
alignment = sinst->getAlignment();
Ty = sinst->getValueOperand()->getType();
unsigned size = getTypeSize(Ty);
if (size <= alignment) {
if (!Ty->isStructTy()) {
linst->setAlignment(0);
sinst->setAlignment(0);
AMDILPeepholeOpt::isSigned24BitOps(CallInst *CI)
Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
std::string namePrefix = LHS->getName().substr(0, 14);
// Note: the "__amdil_imul24" prefix also matches "__amdil_imul24_high".
if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24") {
if (mSTM->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) {
AMDILPeepholeOpt::expandSigned24BitOps(CallInst *CI)
assert(isSigned24BitOps(CI) && "Must be a "
"signed 24 bit operation to call this function!");
Value *LHS = CI->getOperand(CI->getNumOperands()-1);
// On 7XX and 8XX we do not have signed 24-bit operations, so we need to
// expand them to the following:
// imul24 turns into 32-bit imul
// imad24 turns into 32-bit imad
// imul24_high turns into 32-bit imulhigh
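// For example (illustrative): a call to __amdil_imad24(a, b, c) becomes a
// call to __amdil_imad(a, b, c), and __amdil_imul24(a, b) becomes a plain
// 32-bit 'mul' instruction.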
if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
Type *aType = CI->getOperand(0)->getType();
bool isVector = aType->isVectorTy();
int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
std::vector<Type*> callTypes;
callTypes.push_back(CI->getOperand(0)->getType());
callTypes.push_back(CI->getOperand(1)->getType());
callTypes.push_back(CI->getOperand(2)->getType());
FunctionType *funcType =
FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
std::string name = "__amdil_imad";
name += "_v" + itostr(numEle) + "i32";
Function *Func = dyn_cast<Function>(
CI->getParent()->getParent()->getParent()->
getOrInsertFunction(llvm::StringRef(name), funcType));
Value *Operands[3] = {
CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
nCI->insertBefore(CI);
CI->replaceAllUsesWith(nCI);
// Check imul24_high before imul24: "__amdil_imul24" is a prefix of
// "__amdil_imul24_high", so the shorter match would otherwise shadow it.
} else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
Type *aType = CI->getOperand(0)->getType();
bool isVector = aType->isVectorTy();
int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
std::vector<Type*> callTypes;
callTypes.push_back(CI->getOperand(0)->getType());
callTypes.push_back(CI->getOperand(1)->getType());
FunctionType *funcType =
FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
std::string name = "__amdil_imul_high";
name += "_v" + itostr(numEle) + "i32";
Function *Func = dyn_cast<Function>(
CI->getParent()->getParent()->getParent()->
getOrInsertFunction(llvm::StringRef(name), funcType));
Value *Operands[2] = {
CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
nCI->insertBefore(CI);
CI->replaceAllUsesWith(nCI);
} else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
BinaryOperator *mulOp =
BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
CI->getOperand(1), "imul24", CI);
CI->replaceAllUsesWith(mulOp);
AMDILPeepholeOpt::isRWGLocalOpt(CallInst *CI)
return (CI != NULL && mRWGOpt
&& CI->getOperand(CI->getNumOperands() - 1)->getName()
== "__amdil_get_local_size_int");
AMDILPeepholeOpt::expandRWGLocalOpt(CallInst *CI)
assert(isRWGLocalOpt(CI) &&
"This optimization only works when the call inst is get_local_size!");
std::vector<Constant *> consts;
for (uint32_t x = 0; x < 3; ++x) {
uint32_t val = mSTM->getGlobalManager()->getLocal(mF->getName(), x);
consts.push_back(ConstantInt::get(Type::getInt32Ty(*mCTX), val));
consts.push_back(ConstantInt::get(Type::getInt32Ty(*mCTX), 0));
Value *cVec = ConstantVector::get(consts);
CI->replaceAllUsesWith(cVec);
AMDILPeepholeOpt::convertAccurateDivide(CallInst *CI)
if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX
&& (mSTM->getDeviceName() == "cayman")) {
return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
== "__amdil_improved_div";
AMDILPeepholeOpt::expandAccurateDivide(CallInst *CI)
assert(convertAccurateDivide(CI)
&& "expanding accurate divide can only happen if it is expandable!");
BinaryOperator *divOp =
BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
CI->getOperand(1), "fdiv32", CI);
CI->replaceAllUsesWith(divOp);
AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI)
if (optLevel != CodeGenOpt::None) {
unsigned funcNameIdx = 0;
funcNameIdx = CI->getNumOperands() - 1;
StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
if (calleeName != "__amdil_image2d_read_norm"
&& calleeName != "__amdil_image2d_read_unnorm"
&& calleeName != "__amdil_image3d_read_norm"
&& calleeName != "__amdil_image3d_read_unnorm") {
unsigned samplerIdx = 2;
Value *sampler = CI->getOperand(samplerIdx);
LoadInst *lInst = dyn_cast<LoadInst>(sampler);
if (lInst->getPointerAddressSpace() != AMDILAS::PRIVATE_ADDRESS) {
GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
// If we are loading from what is not a global value, then we
// If we don't have an initializer, or we have an initializer and
// the initializer is not a 32-bit integer, we fail.
if (!gv->hasInitializer()
|| !gv->getInitializer()->getType()->isIntegerTy(32)) {
// Now that we have the global variable initializer, let's replace
// all uses of the load instruction with the samplerVal and
// reparse the __amdil_is_constant() function.
Constant *samplerVal = gv->getInitializer();
lInst->replaceAllUsesWith(samplerVal);
AMDILPeepholeOpt::doInitialization(Module &M)
AMDILPeepholeOpt::doFinalization(Module &M)
AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
AU.addRequired<MachineFunctionAnalysis>();
FunctionPass::getAnalysisUsage(AU);
AU.setPreservesAll();