lib/Target/AMDGPU/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
  34                                        const R600Subtarget &STI)
  35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  36   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  37   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  39   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  40   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  41   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  42
  43   computeRegisterProperties(STI.getRegisterInfo());
  44
  45   // Legalize loads and stores to the private address space.
  46   setOperationAction(ISD::LOAD, MVT::i32, Custom);
  47   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
  48   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  49
  50   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
  51   // spaces, so it is custom lowered to handle those where it isn't.
  52   for (MVT VT : MVT::integer_valuetypes()) {
  53     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
  54     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
  55     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
  56
  57     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
  58     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
  59     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
  60
  61     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
  62     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
  63     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
  64   }
  65
  66   // Workaround for LegalizeDAG asserting on expansion of i1 vector loads.
  67   setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  68   setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  69   setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i1, Expand);
  70
  71   setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  72   setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  73   setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i1, Expand);
  74
  75
  76   setOperationAction(ISD::STORE, MVT::i8, Custom);
  77   setOperationAction(ISD::STORE, MVT::i32, Custom);
  78   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
  79   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
  80
  81   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
  82   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
  83
  84   // Workaround for LegalizeDAG asserting on expansion of i1 vector stores.
  85   setTruncStoreAction(MVT::v2i32, MVT::v2i1, Expand);
  86   setTruncStoreAction(MVT::v4i32, MVT::v4i1, Expand);
  87
  88   // Set condition code actions
  89   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  90   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  91   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  92   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  93   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  94   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  95   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  96   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  97   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  98   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  99   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
 100   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
 101
 102   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
 103   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
 104   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
 105   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
 106
 107   setOperationAction(ISD::FCOS, MVT::f32, Custom);
 108   setOperationAction(ISD::FSIN, MVT::f32, Custom);
 109
 110   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
 111   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
 112
 113   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
 114   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
 115   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
 116
 117   setOperationAction(ISD::FSUB, MVT::f32, Expand);
 118
 119   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
 120   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 121
 122   setOperationAction(ISD::SETCC, MVT::i32, Expand);
 123   setOperationAction(ISD::SETCC, MVT::f32, Expand);
 124   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
 125   setOperationAction(ISD::FP_TO_SINT, MVT::i1, Custom);
 126   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
 127   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
 128
 129   setOperationAction(ISD::SELECT, MVT::i32, Expand);
 130   setOperationAction(ISD::SELECT, MVT::f32, Expand);
 131   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
 132   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
 133
 134   // ADD, SUB overflow.
 135   // TODO: turn these into Legal?
 136   if (Subtarget->hasCARRY())
 137     setOperationAction(ISD::UADDO, MVT::i32, Custom);
 138
 139   if (Subtarget->hasBORROW())
 140     setOperationAction(ISD::USUBO, MVT::i32, Custom);
 141
 142   // Expand sign extension of vectors
 143   if (!Subtarget->hasBFE())
 144     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 145
 146   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
 147   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 148
 149   if (!Subtarget->hasBFE())
 150     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 151   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 152   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 153
 154   if (!Subtarget->hasBFE())
 155     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 156   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 157   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 158
 159   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 160   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 161   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 162
 163   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 164
 165   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 166
 167   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 168   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 169   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 170   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 171
 172   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 173   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 174   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 175   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 176
 177   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 178   //  to be Legal/Custom in order to avoid library calls.
 179   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 180   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 181   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 182
 183   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 184
 185   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 186   for (MVT VT : ScalarIntVTs) {
 187     setOperationAction(ISD::ADDC, VT, Expand);
 188     setOperationAction(ISD::SUBC, VT, Expand);
 189     setOperationAction(ISD::ADDE, VT, Expand);
 190     setOperationAction(ISD::SUBE, VT, Expand);
 191   }
 192
 193   setSchedulingPreference(Sched::Source);
 194
 195
 196   setTargetDAGCombine(ISD::FP_ROUND);
 197   setTargetDAGCombine(ISD::FP_TO_SINT);
 198   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 199   setTargetDAGCombine(ISD::SELECT_CC);
 200   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 201 }
 202
 203 const R600Subtarget *R600TargetLowering::getSubtarget() const {
 204   return static_cast<const R600Subtarget *>(Subtarget);
 205 }
 206
 207 static inline bool isEOP(MachineBasicBlock::iterator I) {
 208   return std::next(I)->getOpcode() == AMDGPU::RETURN;
 209 }
 210
 211 MachineBasicBlock *
 212 R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 213                                                 MachineBasicBlock *BB) const {
 214   MachineFunction * MF = BB->getParent();
 215   MachineRegisterInfo &MRI = MF->getRegInfo();
 216   MachineBasicBlock::iterator I = MI;
 217   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
 218
 219   switch (MI.getOpcode()) {
 220   default:
 221     // Replace LDS_*_RET instruction that don't have any uses with the
 222     // equivalent LDS_*_NORET instruction.
 223     if (TII->isLDSRetInstr(MI.getOpcode())) {
 224       int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
 225       assert(DstIdx != -1);
 226       MachineInstrBuilder NewMI;
 227       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 228       //        LDS_1A2D support and remove this special case.
 229       if (!MRI.use_empty(MI.getOperand(DstIdx).getReg()) ||
 230           MI.getOpcode() == AMDGPU::LDS_CMPST_RET)
 231         return BB;
 232
 233       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 234                       TII->get(AMDGPU::getLDSNoRetOp(MI.getOpcode())));
 235       for (unsigned i = 1, e = MI.getNumOperands(); i < e; ++i) {
 236         NewMI.addOperand(MI.getOperand(i));
 237       }
 238     } else {
 239       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 240     }
 241     break;
 242   case AMDGPU::CLAMP_R600: {
 243     MachineInstr *NewMI = TII->buildDefaultInstruction(
 244         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
 245         MI.getOperand(1).getReg());
 246     TII->addFlag(*NewMI, 0, MO_FLAG_CLAMP);
 247     break;
 248   }
 249
 250   case AMDGPU::FABS_R600: {
 251     MachineInstr *NewMI = TII->buildDefaultInstruction(
 252         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
 253         MI.getOperand(1).getReg());
 254     TII->addFlag(*NewMI, 0, MO_FLAG_ABS);
 255     break;
 256   }
 257
 258   case AMDGPU::FNEG_R600: {
 259     MachineInstr *NewMI = TII->buildDefaultInstruction(
 260         *BB, I, AMDGPU::MOV, MI.getOperand(0).getReg(),
 261         MI.getOperand(1).getReg());
 262     TII->addFlag(*NewMI, 0, MO_FLAG_NEG);
 263     break;
 264   }
 265
 266   case AMDGPU::MASK_WRITE: {
 267     unsigned maskedRegister = MI.getOperand(0).getReg();
 268     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 269     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 270     TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
 271     break;
 272   }
 273
 274   case AMDGPU::MOV_IMM_F32:
 275     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(), MI.getOperand(1)
 276                                                             .getFPImm()
 277                                                             ->getValueAPF()
 278                                                             .bitcastToAPInt()
 279                                                             .getZExtValue());
 280     break;
 281   case AMDGPU::MOV_IMM_I32:
 282     TII->buildMovImm(*BB, I, MI.getOperand(0).getReg(),
 283                      MI.getOperand(1).getImm());
 284     break;
 285   case AMDGPU::MOV_IMM_GLOBAL_ADDR: {
 286     //TODO: Perhaps combine this instruction with the next if possible
 287     auto MIB = TII->buildDefaultInstruction(
 288         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_LITERAL_X);
 289     int Idx = TII->getOperandIdx(*MIB, AMDGPU::OpName::literal);
 290     //TODO: Ugh this is rather ugly
 291     MIB->getOperand(Idx) = MI.getOperand(1);
 292     break;
 293   }
 294   case AMDGPU::CONST_COPY: {
 295     MachineInstr *NewMI = TII->buildDefaultInstruction(
 296         *BB, MI, AMDGPU::MOV, MI.getOperand(0).getReg(), AMDGPU::ALU_CONST);
 297     TII->setImmOperand(*NewMI, AMDGPU::OpName::src0_sel,
 298                        MI.getOperand(1).getImm());
 299     break;
 300   }
 301
 302   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 303   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 304   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 305     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
 306         .addOperand(MI.getOperand(0))
 307         .addOperand(MI.getOperand(1))
 308         .addImm(isEOP(I)); // Set End of program bit
 309     break;
 310   }
 311   case AMDGPU::RAT_STORE_TYPED_eg: {
 312     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
 313         .addOperand(MI.getOperand(0))
 314         .addOperand(MI.getOperand(1))
 315         .addOperand(MI.getOperand(2))
 316         .addImm(isEOP(I)); // Set End of program bit
 317     break;
 318   }
 319
 320   case AMDGPU::TXD: {
 321     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 322     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 323     MachineOperand &RID = MI.getOperand(4);
 324     MachineOperand &SID = MI.getOperand(5);
 325     unsigned TextureId = MI.getOperand(6).getImm();
 326     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 327     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 328
 329     switch (TextureId) {
 330     case 5: // Rect
 331       CTX = CTY = 0;
 332       break;
 333     case 6: // Shadow1D
 334       SrcW = SrcZ;
 335       break;
 336     case 7: // Shadow2D
 337       SrcW = SrcZ;
 338       break;
 339     case 8: // ShadowRect
 340       CTX = CTY = 0;
 341       SrcW = SrcZ;
 342       break;
 343     case 9: // 1DArray
 344       SrcZ = SrcY;
 345       CTZ = 0;
 346       break;
 347     case 10: // 2DArray
 348       CTZ = 0;
 349       break;
 350     case 11: // Shadow1DArray
 351       SrcZ = SrcY;
 352       CTZ = 0;
 353       break;
 354     case 12: // Shadow2DArray
 355       CTZ = 0;
 356       break;
 357     }
 358     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
 359             T0)
 360         .addOperand(MI.getOperand(3))
 361         .addImm(SrcX)
 362         .addImm(SrcY)
 363         .addImm(SrcZ)
 364         .addImm(SrcW)
 365         .addImm(0)
 366         .addImm(0)
 367         .addImm(0)
 368         .addImm(0)
 369         .addImm(1)
 370         .addImm(2)
 371         .addImm(3)
 372         .addOperand(RID)
 373         .addOperand(SID)
 374         .addImm(CTX)
 375         .addImm(CTY)
 376         .addImm(CTZ)
 377         .addImm(CTW);
 378     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
 379             T1)
 380         .addOperand(MI.getOperand(2))
 381         .addImm(SrcX)
 382         .addImm(SrcY)
 383         .addImm(SrcZ)
 384         .addImm(SrcW)
 385         .addImm(0)
 386         .addImm(0)
 387         .addImm(0)
 388         .addImm(0)
 389         .addImm(1)
 390         .addImm(2)
 391         .addImm(3)
 392         .addOperand(RID)
 393         .addOperand(SID)
 394         .addImm(CTX)
 395         .addImm(CTY)
 396         .addImm(CTZ)
 397         .addImm(CTW);
 398     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 399         .addOperand(MI.getOperand(0))
 400         .addOperand(MI.getOperand(1))
 401         .addImm(SrcX)
 402         .addImm(SrcY)
 403         .addImm(SrcZ)
 404         .addImm(SrcW)
 405         .addImm(0)
 406         .addImm(0)
 407         .addImm(0)
 408         .addImm(0)
 409         .addImm(1)
 410         .addImm(2)
 411         .addImm(3)
 412         .addOperand(RID)
 413         .addOperand(SID)
 414         .addImm(CTX)
 415         .addImm(CTY)
 416         .addImm(CTZ)
 417         .addImm(CTW)
 418         .addReg(T0, RegState::Implicit)
 419         .addReg(T1, RegState::Implicit);
 420     break;
 421   }
 422
 423   case AMDGPU::TXD_SHADOW: {
 424     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 425     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 426     MachineOperand &RID = MI.getOperand(4);
 427     MachineOperand &SID = MI.getOperand(5);
 428     unsigned TextureId = MI.getOperand(6).getImm();
 429     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 430     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 431
 432     switch (TextureId) {
 433     case 5: // Rect
 434       CTX = CTY = 0;
 435       break;
 436     case 6: // Shadow1D
 437       SrcW = SrcZ;
 438       break;
 439     case 7: // Shadow2D
 440       SrcW = SrcZ;
 441       break;
 442     case 8: // ShadowRect
 443       CTX = CTY = 0;
 444       SrcW = SrcZ;
 445       break;
 446     case 9: // 1DArray
 447       SrcZ = SrcY;
 448       CTZ = 0;
 449       break;
 450     case 10: // 2DArray
 451       CTZ = 0;
 452       break;
 453     case 11: // Shadow1DArray
 454       SrcZ = SrcY;
 455       CTZ = 0;
 456       break;
 457     case 12: // Shadow2DArray
 458       CTZ = 0;
 459       break;
 460     }
 461
 462     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H),
 463             T0)
 464         .addOperand(MI.getOperand(3))
 465         .addImm(SrcX)
 466         .addImm(SrcY)
 467         .addImm(SrcZ)
 468         .addImm(SrcW)
 469         .addImm(0)
 470         .addImm(0)
 471         .addImm(0)
 472         .addImm(0)
 473         .addImm(1)
 474         .addImm(2)
 475         .addImm(3)
 476         .addOperand(RID)
 477         .addOperand(SID)
 478         .addImm(CTX)
 479         .addImm(CTY)
 480         .addImm(CTZ)
 481         .addImm(CTW);
 482     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V),
 483             T1)
 484         .addOperand(MI.getOperand(2))
 485         .addImm(SrcX)
 486         .addImm(SrcY)
 487         .addImm(SrcZ)
 488         .addImm(SrcW)
 489         .addImm(0)
 490         .addImm(0)
 491         .addImm(0)
 492         .addImm(0)
 493         .addImm(1)
 494         .addImm(2)
 495         .addImm(3)
 496         .addOperand(RID)
 497         .addOperand(SID)
 498         .addImm(CTX)
 499         .addImm(CTY)
 500         .addImm(CTZ)
 501         .addImm(CTW);
 502     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 503         .addOperand(MI.getOperand(0))
 504         .addOperand(MI.getOperand(1))
 505         .addImm(SrcX)
 506         .addImm(SrcY)
 507         .addImm(SrcZ)
 508         .addImm(SrcW)
 509         .addImm(0)
 510         .addImm(0)
 511         .addImm(0)
 512         .addImm(0)
 513         .addImm(1)
 514         .addImm(2)
 515         .addImm(3)
 516         .addOperand(RID)
 517         .addOperand(SID)
 518         .addImm(CTX)
 519         .addImm(CTY)
 520         .addImm(CTZ)
 521         .addImm(CTW)
 522         .addReg(T0, RegState::Implicit)
 523         .addReg(T1, RegState::Implicit);
 524     break;
 525   }
 526
 527   case AMDGPU::BRANCH:
 528     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 529         .addOperand(MI.getOperand(0));
 530     break;
 531
 532   case AMDGPU::BRANCH_COND_f32: {
 533     MachineInstr *NewMI =
 534         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 535                 AMDGPU::PREDICATE_BIT)
 536             .addOperand(MI.getOperand(1))
 537             .addImm(OPCODE_IS_NOT_ZERO)
 538             .addImm(0); // Flags
 539     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
 540     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 541         .addOperand(MI.getOperand(0))
 542         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 543     break;
 544   }
 545
 546   case AMDGPU::BRANCH_COND_i32: {
 547     MachineInstr *NewMI =
 548         BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 549                 AMDGPU::PREDICATE_BIT)
 550             .addOperand(MI.getOperand(1))
 551             .addImm(OPCODE_IS_NOT_ZERO_INT)
 552             .addImm(0); // Flags
 553     TII->addFlag(*NewMI, 0, MO_FLAG_PUSH);
 554     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 555         .addOperand(MI.getOperand(0))
 556         .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 557     break;
 558   }
 559
 560   case AMDGPU::EG_ExportSwz:
 561   case AMDGPU::R600_ExportSwz: {
 562     // Instruction is left unmodified if its not the last one of its type
 563     bool isLastInstructionOfItsType = true;
 564     unsigned InstExportType = MI.getOperand(1).getImm();
 565     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 566          EndBlock = BB->end(); NextExportInst != EndBlock;
 567          NextExportInst = std::next(NextExportInst)) {
 568       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 569           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 570         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 571             .getImm();
 572         if (CurrentInstExportType == InstExportType) {
 573           isLastInstructionOfItsType = false;
 574           break;
 575         }
 576       }
 577     }
 578     bool EOP = isEOP(I);
 579     if (!EOP && !isLastInstructionOfItsType)
 580       return BB;
 581     unsigned CfInst = (MI.getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
 582     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI.getOpcode()))
 583         .addOperand(MI.getOperand(0))
 584         .addOperand(MI.getOperand(1))
 585         .addOperand(MI.getOperand(2))
 586         .addOperand(MI.getOperand(3))
 587         .addOperand(MI.getOperand(4))
 588         .addOperand(MI.getOperand(5))
 589         .addOperand(MI.getOperand(6))
 590         .addImm(CfInst)
 591         .addImm(EOP);
 592     break;
 593   }
 594   case AMDGPU::RETURN: {
 595     return BB;
 596   }
 597   }
 598
 599   MI.eraseFromParent();
 600   return BB;
 601 }
 602
 603 //===----------------------------------------------------------------------===//
 604 // Custom DAG Lowering Operations
 605 //===----------------------------------------------------------------------===//
 606
 607 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 608   MachineFunction &MF = DAG.getMachineFunction();
 609   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 610   switch (Op.getOpcode()) {
 611   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 612   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 613   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 614   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 615   case ISD::SRA_PARTS:
 616   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 617   case ISD::UADDO: return LowerUADDSUBO(Op, DAG, ISD::ADD, AMDGPUISD::CARRY);
 618   case ISD::USUBO: return LowerUADDSUBO(Op, DAG, ISD::SUB, AMDGPUISD::BORROW);
 619   case ISD::FCOS:
 620   case ISD::FSIN: return LowerTrig(Op, DAG);
 621   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 622   case ISD::STORE: return LowerSTORE(Op, DAG);
 623   case ISD::LOAD: {
 624     SDValue Result = LowerLOAD(Op, DAG);
 625     assert((!Result.getNode() ||
 626             Result.getNode()->getNumValues() == 2) &&
 627            "Load should return a value and a chain");
 628     return Result;
 629   }
 630
 631   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 632   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 633   case ISD::FrameIndex: return lowerFrameIndex(Op, DAG);
 634   case ISD::INTRINSIC_VOID: {
 635     SDValue Chain = Op.getOperand(0);
 636     unsigned IntrinsicID =
 637                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 638     switch (IntrinsicID) {
 639     case AMDGPUIntrinsic::r600_store_swizzle: {
 640       SDLoc DL(Op);
 641       const SDValue Args[8] = {
 642         Chain,
 643         Op.getOperand(2), // Export Value
 644         Op.getOperand(3), // ArrayBase
 645         Op.getOperand(4), // Type
 646         DAG.getConstant(0, DL, MVT::i32), // SWZ_X
 647         DAG.getConstant(1, DL, MVT::i32), // SWZ_Y
 648         DAG.getConstant(2, DL, MVT::i32), // SWZ_Z
 649         DAG.getConstant(3, DL, MVT::i32) // SWZ_W
 650       };
 651       return DAG.getNode(AMDGPUISD::EXPORT, DL, Op.getValueType(), Args);
 652     }
 653
 654     // default for switch(IntrinsicID)
 655     default: break;
 656     }
 657     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 658     break;
 659   }
 660   case ISD::INTRINSIC_WO_CHAIN: {
 661     unsigned IntrinsicID =
 662                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 663     EVT VT = Op.getValueType();
 664     SDLoc DL(Op);
 665     switch(IntrinsicID) {
 666     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 667     case AMDGPUIntrinsic::r600_tex:
 668     case AMDGPUIntrinsic::r600_texc: {
 669       unsigned TextureOp;
 670       switch (IntrinsicID) {
 671       case AMDGPUIntrinsic::r600_tex:
 672         TextureOp = 0;
 673         break;
 674       case AMDGPUIntrinsic::r600_texc:
 675         TextureOp = 1;
 676         break;
 677       default:
 678         llvm_unreachable("Unknow Texture Operation");
 679       }
 680
 681       SDValue TexArgs[19] = {
 682         DAG.getConstant(TextureOp, DL, MVT::i32),
 683         Op.getOperand(1),
 684         DAG.getConstant(0, DL, MVT::i32),
 685         DAG.getConstant(1, DL, MVT::i32),
 686         DAG.getConstant(2, DL, MVT::i32),
 687         DAG.getConstant(3, DL, MVT::i32),
 688         Op.getOperand(2),
 689         Op.getOperand(3),
 690         Op.getOperand(4),
 691         DAG.getConstant(0, DL, MVT::i32),
 692         DAG.getConstant(1, DL, MVT::i32),
 693         DAG.getConstant(2, DL, MVT::i32),
 694         DAG.getConstant(3, DL, MVT::i32),
 695         Op.getOperand(5),
 696         Op.getOperand(6),
 697         Op.getOperand(7),
 698         Op.getOperand(8),
 699         Op.getOperand(9),
 700         Op.getOperand(10)
 701       };
 702       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 703     }
 704     case AMDGPUIntrinsic::r600_dot4: {
 705       SDValue Args[8] = {
 706       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 707           DAG.getConstant(0, DL, MVT::i32)),
 708       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 709           DAG.getConstant(0, DL, MVT::i32)),
 710       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 711           DAG.getConstant(1, DL, MVT::i32)),
 712       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 713           DAG.getConstant(1, DL, MVT::i32)),
 714       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 715           DAG.getConstant(2, DL, MVT::i32)),
 716       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 717           DAG.getConstant(2, DL, MVT::i32)),
 718       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 719           DAG.getConstant(3, DL, MVT::i32)),
 720       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 721           DAG.getConstant(3, DL, MVT::i32))
 722       };
 723       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 724     }
 725
 726     case Intrinsic::r600_implicitarg_ptr: {
 727       MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
 728       uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
 729       return DAG.getConstant(ByteOffset, DL, PtrVT);
 730     }
 731     case Intrinsic::r600_read_ngroups_x:
 732       return LowerImplicitParameter(DAG, VT, DL, 0);
 733     case Intrinsic::r600_read_ngroups_y:
 734       return LowerImplicitParameter(DAG, VT, DL, 1);
 735     case Intrinsic::r600_read_ngroups_z:
 736       return LowerImplicitParameter(DAG, VT, DL, 2);
 737     case Intrinsic::r600_read_global_size_x:
 738       return LowerImplicitParameter(DAG, VT, DL, 3);
 739     case Intrinsic::r600_read_global_size_y:
 740       return LowerImplicitParameter(DAG, VT, DL, 4);
 741     case Intrinsic::r600_read_global_size_z:
 742       return LowerImplicitParameter(DAG, VT, DL, 5);
 743     case Intrinsic::r600_read_local_size_x:
 744       return LowerImplicitParameter(DAG, VT, DL, 6);
 745     case Intrinsic::r600_read_local_size_y:
 746       return LowerImplicitParameter(DAG, VT, DL, 7);
 747     case Intrinsic::r600_read_local_size_z:
 748       return LowerImplicitParameter(DAG, VT, DL, 8);
 749
 750     case Intrinsic::r600_read_workdim:
 751     case AMDGPUIntrinsic::AMDGPU_read_workdim: { // Legacy name.
 752       uint32_t ByteOffset = getImplicitParameterOffset(MFI, GRID_DIM);
 753       return LowerImplicitParameter(DAG, VT, DL, ByteOffset / 4);
 754     }
 755
 756     case Intrinsic::r600_read_tgid_x:
 757       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 758                                   AMDGPU::T1_X, VT);
 759     case Intrinsic::r600_read_tgid_y:
 760       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 761                                   AMDGPU::T1_Y, VT);
 762     case Intrinsic::r600_read_tgid_z:
 763       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 764                                   AMDGPU::T1_Z, VT);
 765     case Intrinsic::r600_read_tidig_x:
 766       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 767                                   AMDGPU::T0_X, VT);
 768     case Intrinsic::r600_read_tidig_y:
 769       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 770                                   AMDGPU::T0_Y, VT);
 771     case Intrinsic::r600_read_tidig_z:
 772       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 773                                   AMDGPU::T0_Z, VT);
 774
 775     case Intrinsic::r600_recipsqrt_ieee:
 776       return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
 777
 778     case Intrinsic::r600_recipsqrt_clamped:
 779       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
 780     }
 781
 782     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 783     break;
 784   }
 785   } // end switch(Op.getOpcode())
 786   return SDValue();
 787 }
 788
 789 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 790                                             SmallVectorImpl<SDValue> &Results,
 791                                             SelectionDAG &DAG) const {
 792   switch (N->getOpcode()) {
 793   default:
 794     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 795     return;
 796   case ISD::FP_TO_UINT:
 797     if (N->getValueType(0) == MVT::i1) {
 798       Results.push_back(lowerFP_TO_UINT(N->getOperand(0), DAG));
 799       return;
 800     }
 801     // Fall-through. Since we don't care about out of bounds values
 802     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 803     // considers some extra cases which are not necessary here.
 804   case ISD::FP_TO_SINT: {
 805     if (N->getValueType(0) == MVT::i1) {
 806       Results.push_back(lowerFP_TO_SINT(N->getOperand(0), DAG));
 807       return;
 808     }
 809
 810     SDValue Result;
 811     if (expandFP_TO_SINT(N, Result, DAG))
 812       Results.push_back(Result);
 813     return;
 814   }
 815   case ISD::SDIVREM: {
 816     SDValue Op = SDValue(N, 1);
 817     SDValue RES = LowerSDIVREM(Op, DAG);
 818     Results.push_back(RES);
 819     Results.push_back(RES.getValue(1));
 820     break;
 821   }
 822   case ISD::UDIVREM: {
 823     SDValue Op = SDValue(N, 0);
 824     LowerUDIVREM64(Op, DAG, Results);
 825     break;
 826   }
 827   }
 828 }
 829
 830 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 831                                                    SDValue Vector) const {
 832
 833   SDLoc DL(Vector);
 834   EVT VecVT = Vector.getValueType();
 835   EVT EltVT = VecVT.getVectorElementType();
 836   SmallVector<SDValue, 8> Args;
 837
 838   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 839                                                            i != e; ++i) {
 840     Args.push_back(DAG.getNode(
 841         ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
 842         DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
 843   }
 844
 845   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 846 }
 847
 848 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 849                                                     SelectionDAG &DAG) const {
 850
 851   SDLoc DL(Op);
 852   SDValue Vector = Op.getOperand(0);
 853   SDValue Index = Op.getOperand(1);
 854
 855   if (isa<ConstantSDNode>(Index) ||
 856       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 857     return Op;
 858
 859   Vector = vectorToVerticalVector(DAG, Vector);
 860   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 861                      Vector, Index);
 862 }
 863
 864 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 865                                                    SelectionDAG &DAG) const {
 866   SDLoc DL(Op);
 867   SDValue Vector = Op.getOperand(0);
 868   SDValue Value = Op.getOperand(1);
 869   SDValue Index = Op.getOperand(2);
 870
 871   if (isa<ConstantSDNode>(Index) ||
 872       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 873     return Op;
 874
 875   Vector = vectorToVerticalVector(DAG, Vector);
 876   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 877                                Vector, Value, Index);
 878   return vectorToVerticalVector(DAG, Insert);
 879 }
 880
 881 SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
 882                                                SDValue Op,
 883                                                SelectionDAG &DAG) const {
 884
 885   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
 886   if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
 887     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
 888
 889   const DataLayout &DL = DAG.getDataLayout();
 890   const GlobalValue *GV = GSD->getGlobal();
 891   MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
 892
 893   SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
 894   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
 895 }
 896
 897 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 898   // On hw >= R700, COS/SIN input must be between -1. and 1.
 899   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 900   EVT VT = Op.getValueType();
 901   SDValue Arg = Op.getOperand(0);
 902   SDLoc DL(Op);
 903
 904   // TODO: Should this propagate fast-math-flags?
 905   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
 906       DAG.getNode(ISD::FADD, DL, VT,
 907         DAG.getNode(ISD::FMUL, DL, VT, Arg,
 908           DAG.getConstantFP(0.15915494309, DL, MVT::f32)),
 909         DAG.getConstantFP(0.5, DL, MVT::f32)));
 910   unsigned TrigNode;
 911   switch (Op.getOpcode()) {
 912   case ISD::FCOS:
 913     TrigNode = AMDGPUISD::COS_HW;
 914     break;
 915   case ISD::FSIN:
 916     TrigNode = AMDGPUISD::SIN_HW;
 917     break;
 918   default:
 919     llvm_unreachable("Wrong trig opcode");
 920   }
 921   SDValue TrigVal = DAG.getNode(TrigNode, DL, VT,
 922       DAG.getNode(ISD::FADD, DL, VT, FractPart,
 923         DAG.getConstantFP(-0.5, DL, MVT::f32)));
 924   if (Gen >= R600Subtarget::R700)
 925     return TrigVal;
 926   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 927   return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
 928       DAG.getConstantFP(3.14159265359, DL, MVT::f32));
 929 }
 930
 931 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
 932   SDLoc DL(Op);
 933   EVT VT = Op.getValueType();
 934
 935   SDValue Lo = Op.getOperand(0);
 936   SDValue Hi = Op.getOperand(1);
 937   SDValue Shift = Op.getOperand(2);
 938   SDValue Zero = DAG.getConstant(0, DL, VT);
 939   SDValue One  = DAG.getConstant(1, DL, VT);
 940
 941   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
 942   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
 943   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
 944   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
 945
 946   // The dance around Width1 is necessary for 0 special case.
 947   // Without it the CompShift might be 32, producing incorrect results in
 948   // Overflow. So we do the shift in two steps, the alternative is to
 949   // add a conditional to filter the special case.
 950
 951   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
 952   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
 953
 954   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
 955   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
 956   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
 957
 958   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
 959   SDValue LoBig = Zero;
 960
 961   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
 962   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
 963
 964   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
 965 }
 966
 967 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
 968   SDLoc DL(Op);
 969   EVT VT = Op.getValueType();
 970
 971   SDValue Lo = Op.getOperand(0);
 972   SDValue Hi = Op.getOperand(1);
 973   SDValue Shift = Op.getOperand(2);
 974   SDValue Zero = DAG.getConstant(0, DL, VT);
 975   SDValue One  = DAG.getConstant(1, DL, VT);
 976
 977   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
 978
 979   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), DL, VT);
 980   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT);
 981   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
 982   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
 983
 984   // The dance around Width1 is necessary for 0 special case.
 985   // Without it the CompShift might be 32, producing incorrect results in
 986   // Overflow. So we do the shift in two steps, the alternative is to
 987   // add a conditional to filter the special case.
 988
 989   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
 990   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
 991
 992   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
 993   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
 994   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
 995
 996   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
 997   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
 998
 999   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1000   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1001
1002   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1003 }
1004
1005 SDValue R600TargetLowering::LowerUADDSUBO(SDValue Op, SelectionDAG &DAG,
1006                                           unsigned mainop, unsigned ovf) const {
1007   SDLoc DL(Op);
1008   EVT VT = Op.getValueType();
1009
1010   SDValue Lo = Op.getOperand(0);
1011   SDValue Hi = Op.getOperand(1);
1012
1013   SDValue OVF = DAG.getNode(ovf, DL, VT, Lo, Hi);
1014   // Extend sign.
1015   OVF = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, OVF,
1016                     DAG.getValueType(MVT::i1));
1017
1018   SDValue Res = DAG.getNode(mainop, DL, VT, Lo, Hi);
1019
1020   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT, VT), Res, OVF);
1021 }
1022
1023 SDValue R600TargetLowering::lowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const {
1024   SDLoc DL(Op);
1025   return DAG.getNode(
1026       ISD::SETCC,
1027       DL,
1028       MVT::i1,
1029       Op, DAG.getConstantFP(1.0f, DL, MVT::f32),
1030       DAG.getCondCode(ISD::SETEQ));
1031 }
1032
1033 SDValue R600TargetLowering::lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const {
1034   SDLoc DL(Op);
1035   return DAG.getNode(
1036       ISD::SETCC,
1037       DL,
1038       MVT::i1,
1039       Op, DAG.getConstantFP(-1.0f, DL, MVT::f32),
1040       DAG.getCondCode(ISD::SETEQ));
1041 }
1042
1043 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1044                                                    const SDLoc &DL,
1045                                                    unsigned DwordOffset) const {
1046   unsigned ByteOffset = DwordOffset * 4;
1047   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1048                                       AMDGPUAS::CONSTANT_BUFFER_0);
1049
1050   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1051   assert(isInt<16>(ByteOffset));
1052
1053   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1054                      DAG.getConstant(ByteOffset, DL, MVT::i32), // PTR
1055                      MachinePointerInfo(ConstantPointerNull::get(PtrType)));
1056 }
1057
1058 bool R600TargetLowering::isZero(SDValue Op) const {
1059   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1060     return Cst->isNullValue();
1061   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1062     return CstFP->isZero();
1063   } else {
1064     return false;
1065   }
1066 }
1067
1068 bool R600TargetLowering::isHWTrueValue(SDValue Op) const {
1069   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1070     return CFP->isExactlyValue(1.0);
1071   }
1072   return isAllOnesConstant(Op);
1073 }
1074
1075 bool R600TargetLowering::isHWFalseValue(SDValue Op) const {
1076   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
1077     return CFP->getValueAPF().isZero();
1078   }
1079   return isNullConstant(Op);
1080 }
1081
1082 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1083   SDLoc DL(Op);
1084   EVT VT = Op.getValueType();
1085
1086   SDValue LHS = Op.getOperand(0);
1087   SDValue RHS = Op.getOperand(1);
1088   SDValue True = Op.getOperand(2);
1089   SDValue False = Op.getOperand(3);
1090   SDValue CC = Op.getOperand(4);
1091   SDValue Temp;
1092
1093   if (VT == MVT::f32) {
1094     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1095     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1096     if (MinMax)
1097       return MinMax;
1098   }
1099
1100   // LHS and RHS are guaranteed to be the same value type
1101   EVT CompareVT = LHS.getValueType();
1102
1103   // Check if we can lower this to a native operation.
1104
1105   // Try to lower to a SET* instruction:
1106   //
1107   // SET* can match the following patterns:
1108   //
1109   // select_cc f32, f32, -1,  0, cc_supported
1110   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1111   // select_cc i32, i32, -1,  0, cc_supported
1112   //
1113
1114   // Move hardware True/False values to the correct operand.
1115   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1116   ISD::CondCode InverseCC =
1117      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1118   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1119     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1120       std::swap(False, True);
1121       CC = DAG.getCondCode(InverseCC);
1122     } else {
1123       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1124       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1125         std::swap(False, True);
1126         std::swap(LHS, RHS);
1127         CC = DAG.getCondCode(SwapInvCC);
1128       }
1129     }
1130   }
1131
1132   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1133       (CompareVT == VT || VT == MVT::i32)) {
1134     // This can be matched by a SET* instruction.
1135     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1136   }
1137
1138   // Try to lower to a CND* instruction:
1139   //
1140   // CND* can match the following patterns:
1141   //
1142   // select_cc f32, 0.0, f32, f32, cc_supported
1143   // select_cc f32, 0.0, i32, i32, cc_supported
1144   // select_cc i32, 0,   f32, f32, cc_supported
1145   // select_cc i32, 0,   i32, i32, cc_supported
1146   //
1147
1148   // Try to move the zero value to the RHS
1149   if (isZero(LHS)) {
1150     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1151     // Try swapping the operands
1152     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1153     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1154       std::swap(LHS, RHS);
1155       CC = DAG.getCondCode(CCSwapped);
1156     } else {
1157       // Try inverting the conditon and then swapping the operands
1158       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1159       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1160       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1161         std::swap(True, False);
1162         std::swap(LHS, RHS);
1163         CC = DAG.getCondCode(CCSwapped);
1164       }
1165     }
1166   }
1167   if (isZero(RHS)) {
1168     SDValue Cond = LHS;
1169     SDValue Zero = RHS;
1170     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1171     if (CompareVT != VT) {
1172       // Bitcast True / False to the correct types.  This will end up being
1173       // a nop, but it allows us to define only a single pattern in the
1174       // .TD files for each CND* instruction rather than having to have
1175       // one pattern for integer True/False and one for fp True/False
1176       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1177       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1178     }
1179
1180     switch (CCOpcode) {
1181     case ISD::SETONE:
1182     case ISD::SETUNE:
1183     case ISD::SETNE:
1184       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1185       Temp = True;
1186       True = False;
1187       False = Temp;
1188       break;
1189     default:
1190       break;
1191     }
1192     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1193         Cond, Zero,
1194         True, False,
1195         DAG.getCondCode(CCOpcode));
1196     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1197   }
1198
1199   // If we make it this for it means we have no native instructions to handle
1200   // this SELECT_CC, so we must lower it.
1201   SDValue HWTrue, HWFalse;
1202
1203   if (CompareVT == MVT::f32) {
1204     HWTrue = DAG.getConstantFP(1.0f, DL, CompareVT);
1205     HWFalse = DAG.getConstantFP(0.0f, DL, CompareVT);
1206   } else if (CompareVT == MVT::i32) {
1207     HWTrue = DAG.getConstant(-1, DL, CompareVT);
1208     HWFalse = DAG.getConstant(0, DL, CompareVT);
1209   }
1210   else {
1211     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1212   }
1213
1214   // Lower this unsupported SELECT_CC into a combination of two supported
1215   // SELECT_CC operations.
1216   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1217
1218   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1219       Cond, HWFalse,
1220       True, False,
1221       DAG.getCondCode(ISD::SETNE));
1222 }
1223
1224 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1225 /// convert these pointers to a register index.  Each register holds
1226 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1227 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1228 /// for indirect addressing.
1229 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1230                                                unsigned StackWidth,
1231                                                SelectionDAG &DAG) const {
1232   unsigned SRLPad;
1233   switch(StackWidth) {
1234   case 1:
1235     SRLPad = 2;
1236     break;
1237   case 2:
1238     SRLPad = 3;
1239     break;
1240   case 4:
1241     SRLPad = 4;
1242     break;
1243   default: llvm_unreachable("Invalid stack width");
1244   }
1245
1246   SDLoc DL(Ptr);
1247   return DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), Ptr,
1248                      DAG.getConstant(SRLPad, DL, MVT::i32));
1249 }
1250
1251 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1252                                          unsigned ElemIdx,
1253                                          unsigned &Channel,
1254                                          unsigned &PtrIncr) const {
1255   switch (StackWidth) {
1256   default:
1257   case 1:
1258     Channel = 0;
1259     if (ElemIdx > 0) {
1260       PtrIncr = 1;
1261     } else {
1262       PtrIncr = 0;
1263     }
1264     break;
1265   case 2:
1266     Channel = ElemIdx % 2;
1267     if (ElemIdx == 2) {
1268       PtrIncr = 1;
1269     } else {
1270       PtrIncr = 0;
1271     }
1272     break;
1273   case 4:
1274     Channel = ElemIdx;
1275     PtrIncr = 0;
1276     break;
1277   }
1278 }
1279
1280 SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
1281                                                    SelectionDAG &DAG) const {
1282   SDLoc DL(Store);
1283
1284   unsigned Mask = 0;
1285   if (Store->getMemoryVT() == MVT::i8) {
1286     Mask = 0xff;
1287   } else if (Store->getMemoryVT() == MVT::i16) {
1288     Mask = 0xffff;
1289   }
1290
1291   SDValue Chain = Store->getChain();
1292   SDValue BasePtr = Store->getBasePtr();
1293   EVT MemVT = Store->getMemoryVT();
1294
1295   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, BasePtr,
1296                             DAG.getConstant(2, DL, MVT::i32));
1297   SDValue Dst = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, MVT::i32,
1298                             Chain, Ptr,
1299                             DAG.getTargetConstant(0, DL, MVT::i32));
1300
1301   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32, BasePtr,
1302                                 DAG.getConstant(0x3, DL, MVT::i32));
1303
1304   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1305                                  DAG.getConstant(3, DL, MVT::i32));
1306
1307   SDValue SExtValue = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32,
1308                                   Store->getValue());
1309
1310   SDValue MaskedValue = DAG.getZeroExtendInReg(SExtValue, DL, MemVT);
1311
1312   SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, MVT::i32,
1313                                      MaskedValue, ShiftAmt);
1314
1315   SDValue DstMask = DAG.getNode(ISD::SHL, DL, MVT::i32,
1316                                 DAG.getConstant(Mask, DL, MVT::i32),
1317                                 ShiftAmt);
1318   DstMask = DAG.getNode(ISD::XOR, DL, MVT::i32, DstMask,
1319                         DAG.getConstant(0xffffffff, DL, MVT::i32));
1320   Dst = DAG.getNode(ISD::AND, DL, MVT::i32, Dst, DstMask);
1321
1322   SDValue Value = DAG.getNode(ISD::OR, DL, MVT::i32, Dst, ShiftedValue);
1323   return DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1324                      Chain, Value, Ptr,
1325                      DAG.getTargetConstant(0, DL, MVT::i32));
1326 }
1327
1328 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1329   if (SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG))
1330     return Result;
1331
1332   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1333   unsigned AS = StoreNode->getAddressSpace();
1334   SDValue Value = StoreNode->getValue();
1335   EVT ValueVT = Value.getValueType();
1336
1337   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
1338       ValueVT.isVector()) {
1339     return SplitVectorStore(Op, DAG);
1340   }
1341
1342   SDLoc DL(Op);
1343   SDValue Chain = StoreNode->getChain();
1344   SDValue Ptr = StoreNode->getBasePtr();
1345
1346   if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
1347     if (StoreNode->isTruncatingStore()) {
1348       EVT VT = Value.getValueType();
1349       assert(VT.bitsLE(MVT::i32));
1350       EVT MemVT = StoreNode->getMemoryVT();
1351       SDValue MaskConstant;
1352       if (MemVT == MVT::i8) {
1353         MaskConstant = DAG.getConstant(0xFF, DL, MVT::i32);
1354       } else {
1355         assert(MemVT == MVT::i16);
1356         MaskConstant = DAG.getConstant(0xFFFF, DL, MVT::i32);
1357       }
1358       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1359                                       DAG.getConstant(2, DL, MVT::i32));
1360       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1361                                       DAG.getConstant(0x00000003, DL, VT));
1362       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1363       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1364                                    DAG.getConstant(3, DL, VT));
1365       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1366       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1367       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1368       // vector instead.
1369       SDValue Src[4] = {
1370         ShiftedValue,
1371         DAG.getConstant(0, DL, MVT::i32),
1372         DAG.getConstant(0, DL, MVT::i32),
1373         Mask
1374       };
1375       SDValue Input = DAG.getBuildVector(MVT::v4i32, DL, Src);
1376       SDValue Args[3] = { Chain, Input, DWordAddr };
1377       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1378                                      Op->getVTList(), Args, MemVT,
1379                                      StoreNode->getMemOperand());
1380     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1381                ValueVT.bitsGE(MVT::i32)) {
1382       // Convert pointer from byte address to dword address.
1383       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1384                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1385                                     Ptr, DAG.getConstant(2, DL, MVT::i32)));
1386
1387       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1388         llvm_unreachable("Truncated and indexed stores not supported yet");
1389       } else {
1390         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1391       }
1392       return Chain;
1393     }
1394   }
1395
1396   if (AS != AMDGPUAS::PRIVATE_ADDRESS)
1397     return SDValue();
1398
1399   EVT MemVT = StoreNode->getMemoryVT();
1400   if (MemVT.bitsLT(MVT::i32))
1401     return lowerPrivateTruncStore(StoreNode, DAG);
1402
1403   // Lowering for indirect addressing
1404   const MachineFunction &MF = DAG.getMachineFunction();
1405   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1406   unsigned StackWidth = TFL->getStackWidth(MF);
1407
1408   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1409
1410   if (ValueVT.isVector()) {
1411     unsigned NumElemVT = ValueVT.getVectorNumElements();
1412     EVT ElemVT = ValueVT.getVectorElementType();
1413     SmallVector<SDValue, 4> Stores(NumElemVT);
1414
1415     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1416                                       "vector width in load");
1417
1418     for (unsigned i = 0; i < NumElemVT; ++i) {
1419       unsigned Channel, PtrIncr;
1420       getStackAddress(StackWidth, i, Channel, PtrIncr);
1421       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1422                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1423       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1424                                  Value, DAG.getConstant(i, DL, MVT::i32));
1425
1426       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1427                               Chain, Elem, Ptr,
1428                               DAG.getTargetConstant(Channel, DL, MVT::i32));
1429     }
1430      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1431    } else {
1432     if (ValueVT == MVT::i8) {
1433       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1434     }
1435     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1436     DAG.getTargetConstant(0, DL, MVT::i32)); // Channel
1437   }
1438
1439   return Chain;
1440 }
1441
1442 // return (512 + (kc_bank << 12)
1443 static int
1444 ConstantAddressBlock(unsigned AddressSpace) {
1445   switch (AddressSpace) {
1446   case AMDGPUAS::CONSTANT_BUFFER_0:
1447     return 512;
1448   case AMDGPUAS::CONSTANT_BUFFER_1:
1449     return 512 + 4096;
1450   case AMDGPUAS::CONSTANT_BUFFER_2:
1451     return 512 + 4096 * 2;
1452   case AMDGPUAS::CONSTANT_BUFFER_3:
1453     return 512 + 4096 * 3;
1454   case AMDGPUAS::CONSTANT_BUFFER_4:
1455     return 512 + 4096 * 4;
1456   case AMDGPUAS::CONSTANT_BUFFER_5:
1457     return 512 + 4096 * 5;
1458   case AMDGPUAS::CONSTANT_BUFFER_6:
1459     return 512 + 4096 * 6;
1460   case AMDGPUAS::CONSTANT_BUFFER_7:
1461     return 512 + 4096 * 7;
1462   case AMDGPUAS::CONSTANT_BUFFER_8:
1463     return 512 + 4096 * 8;
1464   case AMDGPUAS::CONSTANT_BUFFER_9:
1465     return 512 + 4096 * 9;
1466   case AMDGPUAS::CONSTANT_BUFFER_10:
1467     return 512 + 4096 * 10;
1468   case AMDGPUAS::CONSTANT_BUFFER_11:
1469     return 512 + 4096 * 11;
1470   case AMDGPUAS::CONSTANT_BUFFER_12:
1471     return 512 + 4096 * 12;
1472   case AMDGPUAS::CONSTANT_BUFFER_13:
1473     return 512 + 4096 * 13;
1474   case AMDGPUAS::CONSTANT_BUFFER_14:
1475     return 512 + 4096 * 14;
1476   case AMDGPUAS::CONSTANT_BUFFER_15:
1477     return 512 + 4096 * 15;
1478   default:
1479     return -1;
1480   }
1481 }
1482
1483 SDValue R600TargetLowering::lowerPrivateExtLoad(SDValue Op,
1484                                                 SelectionDAG &DAG) const {
1485   SDLoc DL(Op);
1486   LoadSDNode *Load = cast<LoadSDNode>(Op);
1487   ISD::LoadExtType ExtType = Load->getExtensionType();
1488   EVT MemVT = Load->getMemoryVT();
1489
1490   // <SI && AS=PRIVATE && EXTLOAD && size < 32bit,
1491   // register (2-)byte extract.
1492
1493   // Get Register holding the target.
1494   SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Load->getBasePtr(),
1495                             DAG.getConstant(2, DL, MVT::i32));
1496   // Load the Register.
1497   SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
1498                             Load->getChain(),
1499                             Ptr,
1500                             DAG.getTargetConstant(0, DL, MVT::i32),
1501                             Op.getOperand(2));
1502
1503   // Get offset within the register.
1504   SDValue ByteIdx = DAG.getNode(ISD::AND, DL, MVT::i32,
1505                                 Load->getBasePtr(),
1506                                 DAG.getConstant(0x3, DL, MVT::i32));
1507
1508   // Bit offset of target byte (byteIdx * 8).
1509   SDValue ShiftAmt = DAG.getNode(ISD::SHL, DL, MVT::i32, ByteIdx,
1510                                  DAG.getConstant(3, DL, MVT::i32));
1511
1512   // Shift to the right.
1513   Ret = DAG.getNode(ISD::SRL, DL, MVT::i32, Ret, ShiftAmt);
1514
1515   // Eliminate the upper bits by setting them to ...
1516   EVT MemEltVT = MemVT.getScalarType();
1517
1518   // ... ones.
1519   if (ExtType == ISD::SEXTLOAD) {
1520     SDValue MemEltVTNode = DAG.getValueType(MemEltVT);
1521
1522     SDValue Ops[] = {
1523       DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, Ret, MemEltVTNode),
1524       Load->getChain()
1525     };
1526
1527     return DAG.getMergeValues(Ops, DL);
1528   }
1529
1530   // ... or zeros.
1531   SDValue Ops[] = {
1532     DAG.getZeroExtendInReg(Ret, DL, MemEltVT),
1533     Load->getChain()
1534   };
1535
1536   return DAG.getMergeValues(Ops, DL);
1537 }
1538
1539 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1540   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1541   unsigned AS = LoadNode->getAddressSpace();
1542   EVT MemVT = LoadNode->getMemoryVT();
1543   ISD::LoadExtType ExtType = LoadNode->getExtensionType();
1544
1545   if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
1546       ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
1547     return lowerPrivateExtLoad(Op, DAG);
1548   }
1549
1550   SDLoc DL(Op);
1551   EVT VT = Op.getValueType();
1552   SDValue Chain = LoadNode->getChain();
1553   SDValue Ptr = LoadNode->getBasePtr();
1554
1555   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1556     SDValue MergedValues[2] = {
1557       scalarizeVectorLoad(LoadNode, DAG),
1558       Chain
1559     };
1560     return DAG.getMergeValues(MergedValues, DL);
1561   }
1562
1563   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1564   if (ConstantBlock > -1 &&
1565       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1566        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1567     SDValue Result;
1568     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1569         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1570         isa<ConstantSDNode>(Ptr)) {
1571       SDValue Slots[4];
1572       for (unsigned i = 0; i < 4; i++) {
1573         // We want Const position encoded with the following formula :
1574         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1575         // const_index is Ptr computed by llvm using an alignment of 16.
1576         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1577         // then div by 4 at the ISel step
1578         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1579             DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
1580         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1581       }
1582       EVT NewVT = MVT::v4i32;
1583       unsigned NumElements = 4;
1584       if (VT.isVector()) {
1585         NewVT = VT;
1586         NumElements = VT.getVectorNumElements();
1587       }
1588       Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
1589     } else {
1590       // non-constant ptr can't be folded, keeps it as a v4f32 load
1591       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1592           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1593                       DAG.getConstant(4, DL, MVT::i32)),
1594                       DAG.getConstant(LoadNode->getAddressSpace() -
1595                                       AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
1596           );
1597     }
1598
1599     if (!VT.isVector()) {
1600       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1601                            DAG.getConstant(0, DL, MVT::i32));
1602     }
1603
1604     SDValue MergedValues[2] = {
1605       Result,
1606       Chain
1607     };
1608     return DAG.getMergeValues(MergedValues, DL);
1609   }
1610
1611   SDValue LoweredLoad;
1612
1613   // For most operations returning SDValue() will result in the node being
1614   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1615   // need to manually expand loads that may be legal in some address spaces and
1616   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1617   // compute shaders, since the data is sign extended when it is uploaded to the
1618   // buffer. However SEXT loads from other address spaces are not supported, so
1619   // we need to expand them here.
1620   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1621     EVT MemVT = LoadNode->getMemoryVT();
1622     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1623     SDValue NewLoad = DAG.getExtLoad(
1624         ISD::EXTLOAD, DL, VT, Chain, Ptr, LoadNode->getPointerInfo(), MemVT,
1625         LoadNode->getAlignment(), LoadNode->getMemOperand()->getFlags());
1626     SDValue Res = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, NewLoad,
1627                               DAG.getValueType(MemVT));
1628
1629     SDValue MergedValues[2] = { Res, Chain };
1630     return DAG.getMergeValues(MergedValues, DL);
1631   }
1632
1633   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1634     return SDValue();
1635   }
1636
1637   // Lowering for indirect addressing
1638   const MachineFunction &MF = DAG.getMachineFunction();
1639   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1640   unsigned StackWidth = TFL->getStackWidth(MF);
1641
1642   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1643
1644   if (VT.isVector()) {
1645     unsigned NumElemVT = VT.getVectorNumElements();
1646     EVT ElemVT = VT.getVectorElementType();
1647     SDValue Loads[4];
1648
1649     assert(NumElemVT <= 4);
1650     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1651                                       "vector width in load");
1652
1653     for (unsigned i = 0; i < NumElemVT; ++i) {
1654       unsigned Channel, PtrIncr;
1655       getStackAddress(StackWidth, i, Channel, PtrIncr);
1656       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1657                         DAG.getConstant(PtrIncr, DL, MVT::i32));
1658       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1659                              Chain, Ptr,
1660                              DAG.getTargetConstant(Channel, DL, MVT::i32),
1661                              Op.getOperand(2));
1662     }
1663     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElemVT);
1664     LoweredLoad = DAG.getBuildVector(TargetVT, DL, makeArrayRef(Loads, NumElemVT));
1665   } else {
1666     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1667                               Chain, Ptr,
1668                               DAG.getTargetConstant(0, DL, MVT::i32), // Channel
1669                               Op.getOperand(2));
1670   }
1671
1672   SDValue Ops[2] = {
1673     LoweredLoad,
1674     Chain
1675   };
1676
1677   return DAG.getMergeValues(Ops, DL);
1678 }
1679
1680 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1681   SDValue Chain = Op.getOperand(0);
1682   SDValue Cond  = Op.getOperand(1);
1683   SDValue Jump  = Op.getOperand(2);
1684
1685   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1686                      Chain, Jump, Cond);
1687 }
1688
1689 SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
1690                                             SelectionDAG &DAG) const {
1691   MachineFunction &MF = DAG.getMachineFunction();
1692   const R600FrameLowering *TFL = getSubtarget()->getFrameLowering();
1693
1694   FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
1695
1696   unsigned FrameIndex = FIN->getIndex();
1697   unsigned IgnoredFrameReg;
1698   unsigned Offset =
1699     TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
1700   return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
1701                          Op.getValueType());
1702 }
1703
1704 /// XXX Only kernel functions are supported, so we can assume for now that
1705 /// every function is a kernel function, but in the future we should use
1706 /// separate calling conventions for kernel and non-kernel functions.
1707 SDValue R600TargetLowering::LowerFormalArguments(
1708     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1709     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1710     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1711   SmallVector<CCValAssign, 16> ArgLocs;
1712   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1713                  *DAG.getContext());
1714   MachineFunction &MF = DAG.getMachineFunction();
1715   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1716
1717   SmallVector<ISD::InputArg, 8> LocalIns;
1718
1719   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1720
1721   AnalyzeFormalArguments(CCInfo, LocalIns);
1722
1723   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1724     CCValAssign &VA = ArgLocs[i];
1725     const ISD::InputArg &In = Ins[i];
1726     EVT VT = In.VT;
1727     EVT MemVT = VA.getLocVT();
1728     if (!VT.isVector() && MemVT.isVector()) {
1729       // Get load source type if scalarized.
1730       MemVT = MemVT.getVectorElementType();
1731     }
1732
1733     if (AMDGPU::isShader(CallConv)) {
1734       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1735       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1736       InVals.push_back(Register);
1737       continue;
1738     }
1739
1740     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1741                                           AMDGPUAS::CONSTANT_BUFFER_0);
1742
1743     // i64 isn't a legal type, so the register type used ends up as i32, which
1744     // isn't expected here. It attempts to create this sextload, but it ends up
1745     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1746     // for <1 x i64>.
1747
1748     // The first 36 bytes of the input buffer contains information about
1749     // thread group and global sizes.
1750     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1751     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1752       // FIXME: This should really check the extload type, but the handling of
1753       // extload vector parameters seems to be broken.
1754
1755       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1756       Ext = ISD::SEXTLOAD;
1757     }
1758
1759     // Compute the offset from the value.
1760     // XXX - I think PartOffset should give you this, but it seems to give the
1761     // size of the register which isn't useful.
1762
1763     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1764     unsigned PartOffset = VA.getLocMemOffset();
1765     unsigned Offset = 36 + VA.getLocMemOffset();
1766
1767     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1768     SDValue Arg = DAG.getLoad(
1769         ISD::UNINDEXED, Ext, VT, DL, Chain,
1770         DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
1771         MemVT, /* Alignment = */ 4,
1772         MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant);
1773
1774     // 4 is the preferred alignment for the CONSTANT memory space.
1775     InVals.push_back(Arg);
1776     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1777   }
1778   return Chain;
1779 }
1780
1781 EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
1782                                            EVT VT) const {
1783    if (!VT.isVector())
1784      return MVT::i32;
1785    return VT.changeVectorElementTypeToInteger();
1786 }
1787
1788 bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1789                                                         unsigned AddrSpace,
1790                                                         unsigned Align,
1791                                                         bool *IsFast) const {
1792   if (IsFast)
1793     *IsFast = false;
1794
1795   if (!VT.isSimple() || VT == MVT::Other)
1796     return false;
1797
1798   if (VT.bitsLT(MVT::i32))
1799     return false;
1800
1801   // TODO: This is a rough estimate.
1802   if (IsFast)
1803     *IsFast = true;
1804
1805   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1806 }
1807
1808 static SDValue CompactSwizzlableVector(
1809   SelectionDAG &DAG, SDValue VectorEntry,
1810   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1811   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1812   assert(RemapSwizzle.empty());
1813   SDValue NewBldVec[4] = {
1814     VectorEntry.getOperand(0),
1815     VectorEntry.getOperand(1),
1816     VectorEntry.getOperand(2),
1817     VectorEntry.getOperand(3)
1818   };
1819
1820   for (unsigned i = 0; i < 4; i++) {
1821     if (NewBldVec[i].isUndef())
1822       // We mask write here to teach later passes that the ith element of this
1823       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1824       // break false dependencies and additionnaly make assembly easier to read.
1825       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1826     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1827       if (C->isZero()) {
1828         RemapSwizzle[i] = 4; // SEL_0
1829         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1830       } else if (C->isExactlyValue(1.0)) {
1831         RemapSwizzle[i] = 5; // SEL_1
1832         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1833       }
1834     }
1835
1836     if (NewBldVec[i].isUndef())
1837       continue;
1838     for (unsigned j = 0; j < i; j++) {
1839       if (NewBldVec[i] == NewBldVec[j]) {
1840         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1841         RemapSwizzle[i] = j;
1842         break;
1843       }
1844     }
1845   }
1846
1847   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1848                             NewBldVec);
1849 }
1850
1851 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1852                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1853   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1854   assert(RemapSwizzle.empty());
1855   SDValue NewBldVec[4] = {
1856       VectorEntry.getOperand(0),
1857       VectorEntry.getOperand(1),
1858       VectorEntry.getOperand(2),
1859       VectorEntry.getOperand(3)
1860   };
1861   bool isUnmovable[4] = { false, false, false, false };
1862   for (unsigned i = 0; i < 4; i++) {
1863     RemapSwizzle[i] = i;
1864     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1865       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1866           ->getZExtValue();
1867       if (i == Idx)
1868         isUnmovable[Idx] = true;
1869     }
1870   }
1871
1872   for (unsigned i = 0; i < 4; i++) {
1873     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1874       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1875           ->getZExtValue();
1876       if (isUnmovable[Idx])
1877         continue;
1878       // Swap i and Idx
1879       std::swap(NewBldVec[Idx], NewBldVec[i]);
1880       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1881       break;
1882     }
1883   }
1884
1885   return DAG.getBuildVector(VectorEntry.getValueType(), SDLoc(VectorEntry),
1886                             NewBldVec);
1887 }
1888
1889 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[4],
1890                                             SelectionDAG &DAG,
1891                                             const SDLoc &DL) const {
1892   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1893   // Old -> New swizzle values
1894   DenseMap<unsigned, unsigned> SwizzleRemap;
1895
1896   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1897   for (unsigned i = 0; i < 4; i++) {
1898     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1899     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1900       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1901   }
1902
1903   SwizzleRemap.clear();
1904   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1905   for (unsigned i = 0; i < 4; i++) {
1906     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1907     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1908       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
1909   }
1910
1911   return BuildVector;
1912 }
1913
1914
1915 //===----------------------------------------------------------------------===//
1916 // Custom DAG Optimizations
1917 //===----------------------------------------------------------------------===//
1918
1919 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1920                                               DAGCombinerInfo &DCI) const {
1921   SelectionDAG &DAG = DCI.DAG;
1922
1923   switch (N->getOpcode()) {
1924   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1925   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1926   case ISD::FP_ROUND: {
1927       SDValue Arg = N->getOperand(0);
1928       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1929         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1930                            Arg.getOperand(0));
1931       }
1932       break;
1933     }
1934
1935   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1936   // (i32 select_cc f32, f32, -1, 0 cc)
1937   //
1938   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1939   // this to one of the SET*_DX10 instructions.
1940   case ISD::FP_TO_SINT: {
1941     SDValue FNeg = N->getOperand(0);
1942     if (FNeg.getOpcode() != ISD::FNEG) {
1943       return SDValue();
1944     }
1945     SDValue SelectCC = FNeg.getOperand(0);
1946     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1947         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1948         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1949         !isHWTrueValue(SelectCC.getOperand(2)) ||
1950         !isHWFalseValue(SelectCC.getOperand(3))) {
1951       return SDValue();
1952     }
1953
1954     SDLoc dl(N);
1955     return DAG.getNode(ISD::SELECT_CC, dl, N->getValueType(0),
1956                            SelectCC.getOperand(0), // LHS
1957                            SelectCC.getOperand(1), // RHS
1958                            DAG.getConstant(-1, dl, MVT::i32), // True
1959                            DAG.getConstant(0, dl, MVT::i32),  // False
1960                            SelectCC.getOperand(4)); // CC
1961
1962     break;
1963   }
1964
1965   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1966   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1967   case ISD::INSERT_VECTOR_ELT: {
1968     SDValue InVec = N->getOperand(0);
1969     SDValue InVal = N->getOperand(1);
1970     SDValue EltNo = N->getOperand(2);
1971     SDLoc dl(N);
1972
1973     // If the inserted element is an UNDEF, just use the input vector.
1974     if (InVal.isUndef())
1975       return InVec;
1976
1977     EVT VT = InVec.getValueType();
1978
1979     // If we can't generate a legal BUILD_VECTOR, exit
1980     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1981       return SDValue();
1982
1983     // Check that we know which element is being inserted
1984     if (!isa<ConstantSDNode>(EltNo))
1985       return SDValue();
1986     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1987
1988     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1989     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1990     // vector elements.
1991     SmallVector<SDValue, 8> Ops;
1992     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1993       Ops.append(InVec.getNode()->op_begin(),
1994                  InVec.getNode()->op_end());
1995     } else if (InVec.isUndef()) {
1996       unsigned NElts = VT.getVectorNumElements();
1997       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1998     } else {
1999       return SDValue();
2000     }
2001
2002     // Insert the element
2003     if (Elt < Ops.size()) {
2004       // All the operands of BUILD_VECTOR must have the same type;
2005       // we enforce that here.
2006       EVT OpVT = Ops[0].getValueType();
2007       if (InVal.getValueType() != OpVT)
2008         InVal = OpVT.bitsGT(InVal.getValueType()) ?
2009           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
2010           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
2011       Ops[Elt] = InVal;
2012     }
2013
2014     // Return the new vector
2015     return DAG.getBuildVector(VT, dl, Ops);
2016   }
2017
2018   // Extract_vec (Build_vector) generated by custom lowering
2019   // also needs to be customly combined
2020   case ISD::EXTRACT_VECTOR_ELT: {
2021     SDValue Arg = N->getOperand(0);
2022     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
2023       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2024         unsigned Element = Const->getZExtValue();
2025         return Arg->getOperand(Element);
2026       }
2027     }
2028     if (Arg.getOpcode() == ISD::BITCAST &&
2029         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
2030       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
2031         unsigned Element = Const->getZExtValue();
2032         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
2033             Arg->getOperand(0).getOperand(Element));
2034       }
2035     }
2036     break;
2037   }
2038
2039   case ISD::SELECT_CC: {
2040     // Try common optimizations
2041     if (SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI))
2042       return Ret;
2043
2044     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
2045     //      selectcc x, y, a, b, inv(cc)
2046     //
2047     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
2048     //      selectcc x, y, a, b, cc
2049     SDValue LHS = N->getOperand(0);
2050     if (LHS.getOpcode() != ISD::SELECT_CC) {
2051       return SDValue();
2052     }
2053
2054     SDValue RHS = N->getOperand(1);
2055     SDValue True = N->getOperand(2);
2056     SDValue False = N->getOperand(3);
2057     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
2058
2059     if (LHS.getOperand(2).getNode() != True.getNode() ||
2060         LHS.getOperand(3).getNode() != False.getNode() ||
2061         RHS.getNode() != False.getNode()) {
2062       return SDValue();
2063     }
2064
2065     switch (NCC) {
2066     default: return SDValue();
2067     case ISD::SETNE: return LHS;
2068     case ISD::SETEQ: {
2069       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
2070       LHSCC = ISD::getSetCCInverse(LHSCC,
2071                                   LHS.getOperand(0).getValueType().isInteger());
2072       if (DCI.isBeforeLegalizeOps() ||
2073           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
2074         return DAG.getSelectCC(SDLoc(N),
2075                                LHS.getOperand(0),
2076                                LHS.getOperand(1),
2077                                LHS.getOperand(2),
2078                                LHS.getOperand(3),
2079                                LHSCC);
2080       break;
2081     }
2082     }
2083     return SDValue();
2084   }
2085
2086   case AMDGPUISD::EXPORT: {
2087     SDValue Arg = N->getOperand(1);
2088     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2089       break;
2090
2091     SDValue NewArgs[8] = {
2092       N->getOperand(0), // Chain
2093       SDValue(),
2094       N->getOperand(2), // ArrayBase
2095       N->getOperand(3), // Type
2096       N->getOperand(4), // SWZ_X
2097       N->getOperand(5), // SWZ_Y
2098       N->getOperand(6), // SWZ_Z
2099       N->getOperand(7) // SWZ_W
2100     };
2101     SDLoc DL(N);
2102     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG, DL);
2103     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2104   }
2105   case AMDGPUISD::TEXTURE_FETCH: {
2106     SDValue Arg = N->getOperand(1);
2107     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2108       break;
2109
2110     SDValue NewArgs[19] = {
2111       N->getOperand(0),
2112       N->getOperand(1),
2113       N->getOperand(2),
2114       N->getOperand(3),
2115       N->getOperand(4),
2116       N->getOperand(5),
2117       N->getOperand(6),
2118       N->getOperand(7),
2119       N->getOperand(8),
2120       N->getOperand(9),
2121       N->getOperand(10),
2122       N->getOperand(11),
2123       N->getOperand(12),
2124       N->getOperand(13),
2125       N->getOperand(14),
2126       N->getOperand(15),
2127       N->getOperand(16),
2128       N->getOperand(17),
2129       N->getOperand(18),
2130     };
2131     SDLoc DL(N);
2132     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
2133     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
2134   }
2135   }
2136
2137   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2138 }
2139
2140 bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
2141                                      SDValue &Src, SDValue &Neg, SDValue &Abs,
2142                                      SDValue &Sel, SDValue &Imm,
2143                                      SelectionDAG &DAG) const {
2144   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2145   if (!Src.isMachineOpcode())
2146     return false;
2147
2148   switch (Src.getMachineOpcode()) {
2149   case AMDGPU::FNEG_R600:
2150     if (!Neg.getNode())
2151       return false;
2152     Src = Src.getOperand(0);
2153     Neg = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2154     return true;
2155   case AMDGPU::FABS_R600:
2156     if (!Abs.getNode())
2157       return false;
2158     Src = Src.getOperand(0);
2159     Abs = DAG.getTargetConstant(1, SDLoc(ParentNode), MVT::i32);
2160     return true;
2161   case AMDGPU::CONST_COPY: {
2162     unsigned Opcode = ParentNode->getMachineOpcode();
2163     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2164
2165     if (!Sel.getNode())
2166       return false;
2167
2168     SDValue CstOffset = Src.getOperand(0);
2169     if (ParentNode->getValueType(0).isVector())
2170       return false;
2171
2172     // Gather constants values
2173     int SrcIndices[] = {
2174       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2175       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2176       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2177       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2178       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2179       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2180       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2181       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2182       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2183       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2184       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2185     };
2186     std::vector<unsigned> Consts;
2187     for (int OtherSrcIdx : SrcIndices) {
2188       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2189       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2190         continue;
2191       if (HasDst) {
2192         OtherSrcIdx--;
2193         OtherSelIdx--;
2194       }
2195       if (RegisterSDNode *Reg =
2196           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2197         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2198           ConstantSDNode *Cst
2199             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2200           Consts.push_back(Cst->getZExtValue());
2201         }
2202       }
2203     }
2204
2205     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2206     Consts.push_back(Cst->getZExtValue());
2207     if (!TII->fitsConstReadLimitations(Consts)) {
2208       return false;
2209     }
2210
2211     Sel = CstOffset;
2212     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2213     return true;
2214   }
2215   case AMDGPU::MOV_IMM_GLOBAL_ADDR:
2216     // Check if the Imm slot is used. Taken from below.
2217     if (cast<ConstantSDNode>(Imm)->getZExtValue())
2218       return false;
2219     Imm = Src.getOperand(0);
2220     Src = DAG.getRegister(AMDGPU::ALU_LITERAL_X, MVT::i32);
2221     return true;
2222   case AMDGPU::MOV_IMM_I32:
2223   case AMDGPU::MOV_IMM_F32: {
2224     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2225     uint64_t ImmValue = 0;
2226
2227
2228     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2229       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2230       float FloatValue = FPC->getValueAPF().convertToFloat();
2231       if (FloatValue == 0.0) {
2232         ImmReg = AMDGPU::ZERO;
2233       } else if (FloatValue == 0.5) {
2234         ImmReg = AMDGPU::HALF;
2235       } else if (FloatValue == 1.0) {
2236         ImmReg = AMDGPU::ONE;
2237       } else {
2238         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2239       }
2240     } else {
2241       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2242       uint64_t Value = C->getZExtValue();
2243       if (Value == 0) {
2244         ImmReg = AMDGPU::ZERO;
2245       } else if (Value == 1) {
2246         ImmReg = AMDGPU::ONE_INT;
2247       } else {
2248         ImmValue = Value;
2249       }
2250     }
2251
2252     // Check that we aren't already using an immediate.
2253     // XXX: It's possible for an instruction to have more than one
2254     // immediate operand, but this is not supported yet.
2255     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2256       if (!Imm.getNode())
2257         return false;
2258       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2259       assert(C);
2260       if (C->getZExtValue())
2261         return false;
2262       Imm = DAG.getTargetConstant(ImmValue, SDLoc(ParentNode), MVT::i32);
2263     }
2264     Src = DAG.getRegister(ImmReg, MVT::i32);
2265     return true;
2266   }
2267   default:
2268     return false;
2269   }
2270 }
2271
2272 /// \brief Fold the instructions after selecting them
2273 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2274                                             SelectionDAG &DAG) const {
2275   const R600InstrInfo *TII = getSubtarget()->getInstrInfo();
2276   if (!Node->isMachineOpcode())
2277     return Node;
2278
2279   unsigned Opcode = Node->getMachineOpcode();
2280   SDValue FakeOp;
2281
2282   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2283
2284   if (Opcode == AMDGPU::DOT_4) {
2285     int OperandIdx[] = {
2286       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2287       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2288       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2289       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2290       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2291       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2292       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2293       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2294         };
2295     int NegIdx[] = {
2296       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2297       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2298       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2299       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2300       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2301       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2302       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2303       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2304     };
2305     int AbsIdx[] = {
2306       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2307       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2308       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2309       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2310       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2311       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2312       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2313       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2314     };
2315     for (unsigned i = 0; i < 8; i++) {
2316       if (OperandIdx[i] < 0)
2317         return Node;
2318       SDValue &Src = Ops[OperandIdx[i] - 1];
2319       SDValue &Neg = Ops[NegIdx[i] - 1];
2320       SDValue &Abs = Ops[AbsIdx[i] - 1];
2321       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2322       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2323       if (HasDst)
2324         SelIdx--;
2325       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2326       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2327         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2328     }
2329   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2330     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2331       SDValue &Src = Ops[i];
2332       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2333         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2334     }
2335   } else if (Opcode == AMDGPU::CLAMP_R600) {
2336     SDValue Src = Node->getOperand(0);
2337     if (!Src.isMachineOpcode() ||
2338         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2339       return Node;
2340     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2341         AMDGPU::OpName::clamp);
2342     if (ClampIdx < 0)
2343       return Node;
2344     SDLoc DL(Node);
2345     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2346     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, DL, MVT::i32);
2347     return DAG.getMachineNode(Src.getMachineOpcode(), DL,
2348                               Node->getVTList(), Ops);
2349   } else {
2350     if (!TII->hasInstrModifiers(Opcode))
2351       return Node;
2352     int OperandIdx[] = {
2353       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2354       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2355       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2356     };
2357     int NegIdx[] = {
2358       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2359       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2360       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2361     };
2362     int AbsIdx[] = {
2363       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2364       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2365       -1
2366     };
2367     for (unsigned i = 0; i < 3; i++) {
2368       if (OperandIdx[i] < 0)
2369         return Node;
2370       SDValue &Src = Ops[OperandIdx[i] - 1];
2371       SDValue &Neg = Ops[NegIdx[i] - 1];
2372       SDValue FakeAbs;
2373       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2374       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2375       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2376       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2377       if (HasDst) {
2378         SelIdx--;
2379         ImmIdx--;
2380       }
2381       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2382       SDValue &Imm = Ops[ImmIdx];
2383       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2384         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2385     }
2386   }
2387
2388   return Node;
2389 }