lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(TargetMachine &TM,
  34                                        const AMDGPUSubtarget &STI)
  35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  42
  43   computeRegisterProperties(STI.getRegisterInfo());
  44
  45   // Set condition code actions
  46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  58
  59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  63
  64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  69
  70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  73
  74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  75
  76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  79
  80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  82
  83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  88
  89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  93
  94   // Expand sign extension of vectors
  95   if (!Subtarget->hasBFE())
  96     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  97
  98   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  99   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 100
 101   if (!Subtarget->hasBFE())
 102     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 103   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 104   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 105
 106   if (!Subtarget->hasBFE())
 107     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 110
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 113   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 114
 115   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 116
 117
 118   // Legalize loads and stores to the private address space.
 119   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 120   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 121   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 122
 123   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 124   // spaces, so it is custom lowered to handle those where it isn't.
 125   for (MVT VT : MVT::integer_valuetypes()) {
 126     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 127     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
 128     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
 129
 130     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 131     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
 132     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
 133
 134     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 135     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
 136     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
 137   }
 138
 139   setOperationAction(ISD::STORE, MVT::i8, Custom);
 140   setOperationAction(ISD::STORE, MVT::i32, Custom);
 141   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 142   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 143   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 144   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 145
 146   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 147   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 148   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 149
 150   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 151   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 152   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 153   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 154
 155   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 156   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 157   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 158   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 159
 160   setTargetDAGCombine(ISD::FP_ROUND);
 161   setTargetDAGCombine(ISD::FP_TO_SINT);
 162   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 163   setTargetDAGCombine(ISD::SELECT_CC);
 164   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 165
 166   setOperationAction(ISD::SUB, MVT::i64, Expand);
 167
 168   // These should be replaced by UDVIREM, but it does not happen automatically
 169   // during Type Legalization
 170   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 171   setOperationAction(ISD::UREM, MVT::i64, Custom);
 172   setOperationAction(ISD::SDIV, MVT::i64, Custom);
 173   setOperationAction(ISD::SREM, MVT::i64, Custom);
 174
 175   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 176   //  to be Legal/Custom in order to avoid library calls.
 177   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 178   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 179   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 180
 181   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 182
 183   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 184   for (MVT VT : ScalarIntVTs) {
 185     setOperationAction(ISD::ADDC, VT, Expand);
 186     setOperationAction(ISD::SUBC, VT, Expand);
 187     setOperationAction(ISD::ADDE, VT, Expand);
 188     setOperationAction(ISD::SUBE, VT, Expand);
 189   }
 190
 191   setSchedulingPreference(Sched::Source);
 192 }
 193
 194 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 195     MachineInstr * MI, MachineBasicBlock * BB) const {
 196   MachineFunction * MF = BB->getParent();
 197   MachineRegisterInfo &MRI = MF->getRegInfo();
 198   MachineBasicBlock::iterator I = *MI;
 199   const R600InstrInfo *TII =
 200       static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 201
 202   switch (MI->getOpcode()) {
 203   default:
 204     // Replace LDS_*_RET instruction that don't have any uses with the
 205     // equivalent LDS_*_NORET instruction.
 206     if (TII->isLDSRetInstr(MI->getOpcode())) {
 207       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 208       assert(DstIdx != -1);
 209       MachineInstrBuilder NewMI;
 210       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 211       //        LDS_1A2D support and remove this special case.
 212       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
 213            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
 214         return BB;
 215
 216       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 217                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 218       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 219         NewMI.addOperand(MI->getOperand(i));
 220       }
 221     } else {
 222       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 223     }
 224     break;
 225   case AMDGPU::CLAMP_R600: {
 226     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 227                                                    AMDGPU::MOV,
 228                                                    MI->getOperand(0).getReg(),
 229                                                    MI->getOperand(1).getReg());
 230     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 231     break;
 232   }
 233
 234   case AMDGPU::FABS_R600: {
 235     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 236                                                     AMDGPU::MOV,
 237                                                     MI->getOperand(0).getReg(),
 238                                                     MI->getOperand(1).getReg());
 239     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 240     break;
 241   }
 242
 243   case AMDGPU::FNEG_R600: {
 244     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 245                                                     AMDGPU::MOV,
 246                                                     MI->getOperand(0).getReg(),
 247                                                     MI->getOperand(1).getReg());
 248     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 249     break;
 250   }
 251
 252   case AMDGPU::MASK_WRITE: {
 253     unsigned maskedRegister = MI->getOperand(0).getReg();
 254     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 255     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 256     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 257     break;
 258   }
 259
 260   case AMDGPU::MOV_IMM_F32:
 261     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 262                      MI->getOperand(1).getFPImm()->getValueAPF()
 263                          .bitcastToAPInt().getZExtValue());
 264     break;
 265   case AMDGPU::MOV_IMM_I32:
 266     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 267                      MI->getOperand(1).getImm());
 268     break;
 269   case AMDGPU::CONST_COPY: {
 270     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 271         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 272     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 273         MI->getOperand(1).getImm());
 274     break;
 275   }
 276
 277   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 278   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 279   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 280     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 281
 282     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 283             .addOperand(MI->getOperand(0))
 284             .addOperand(MI->getOperand(1))
 285             .addImm(EOP); // Set End of program bit
 286     break;
 287   }
 288
 289   case AMDGPU::TXD: {
 290     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 291     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 292     MachineOperand &RID = MI->getOperand(4);
 293     MachineOperand &SID = MI->getOperand(5);
 294     unsigned TextureId = MI->getOperand(6).getImm();
 295     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 296     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 297
 298     switch (TextureId) {
 299     case 5: // Rect
 300       CTX = CTY = 0;
 301       break;
 302     case 6: // Shadow1D
 303       SrcW = SrcZ;
 304       break;
 305     case 7: // Shadow2D
 306       SrcW = SrcZ;
 307       break;
 308     case 8: // ShadowRect
 309       CTX = CTY = 0;
 310       SrcW = SrcZ;
 311       break;
 312     case 9: // 1DArray
 313       SrcZ = SrcY;
 314       CTZ = 0;
 315       break;
 316     case 10: // 2DArray
 317       CTZ = 0;
 318       break;
 319     case 11: // Shadow1DArray
 320       SrcZ = SrcY;
 321       CTZ = 0;
 322       break;
 323     case 12: // Shadow2DArray
 324       CTZ = 0;
 325       break;
 326     }
 327     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 328             .addOperand(MI->getOperand(3))
 329             .addImm(SrcX)
 330             .addImm(SrcY)
 331             .addImm(SrcZ)
 332             .addImm(SrcW)
 333             .addImm(0)
 334             .addImm(0)
 335             .addImm(0)
 336             .addImm(0)
 337             .addImm(1)
 338             .addImm(2)
 339             .addImm(3)
 340             .addOperand(RID)
 341             .addOperand(SID)
 342             .addImm(CTX)
 343             .addImm(CTY)
 344             .addImm(CTZ)
 345             .addImm(CTW);
 346     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 347             .addOperand(MI->getOperand(2))
 348             .addImm(SrcX)
 349             .addImm(SrcY)
 350             .addImm(SrcZ)
 351             .addImm(SrcW)
 352             .addImm(0)
 353             .addImm(0)
 354             .addImm(0)
 355             .addImm(0)
 356             .addImm(1)
 357             .addImm(2)
 358             .addImm(3)
 359             .addOperand(RID)
 360             .addOperand(SID)
 361             .addImm(CTX)
 362             .addImm(CTY)
 363             .addImm(CTZ)
 364             .addImm(CTW);
 365     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 366             .addOperand(MI->getOperand(0))
 367             .addOperand(MI->getOperand(1))
 368             .addImm(SrcX)
 369             .addImm(SrcY)
 370             .addImm(SrcZ)
 371             .addImm(SrcW)
 372             .addImm(0)
 373             .addImm(0)
 374             .addImm(0)
 375             .addImm(0)
 376             .addImm(1)
 377             .addImm(2)
 378             .addImm(3)
 379             .addOperand(RID)
 380             .addOperand(SID)
 381             .addImm(CTX)
 382             .addImm(CTY)
 383             .addImm(CTZ)
 384             .addImm(CTW)
 385             .addReg(T0, RegState::Implicit)
 386             .addReg(T1, RegState::Implicit);
 387     break;
 388   }
 389
 390   case AMDGPU::TXD_SHADOW: {
 391     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 392     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 393     MachineOperand &RID = MI->getOperand(4);
 394     MachineOperand &SID = MI->getOperand(5);
 395     unsigned TextureId = MI->getOperand(6).getImm();
 396     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 397     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 398
 399     switch (TextureId) {
 400     case 5: // Rect
 401       CTX = CTY = 0;
 402       break;
 403     case 6: // Shadow1D
 404       SrcW = SrcZ;
 405       break;
 406     case 7: // Shadow2D
 407       SrcW = SrcZ;
 408       break;
 409     case 8: // ShadowRect
 410       CTX = CTY = 0;
 411       SrcW = SrcZ;
 412       break;
 413     case 9: // 1DArray
 414       SrcZ = SrcY;
 415       CTZ = 0;
 416       break;
 417     case 10: // 2DArray
 418       CTZ = 0;
 419       break;
 420     case 11: // Shadow1DArray
 421       SrcZ = SrcY;
 422       CTZ = 0;
 423       break;
 424     case 12: // Shadow2DArray
 425       CTZ = 0;
 426       break;
 427     }
 428
 429     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 430             .addOperand(MI->getOperand(3))
 431             .addImm(SrcX)
 432             .addImm(SrcY)
 433             .addImm(SrcZ)
 434             .addImm(SrcW)
 435             .addImm(0)
 436             .addImm(0)
 437             .addImm(0)
 438             .addImm(0)
 439             .addImm(1)
 440             .addImm(2)
 441             .addImm(3)
 442             .addOperand(RID)
 443             .addOperand(SID)
 444             .addImm(CTX)
 445             .addImm(CTY)
 446             .addImm(CTZ)
 447             .addImm(CTW);
 448     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 449             .addOperand(MI->getOperand(2))
 450             .addImm(SrcX)
 451             .addImm(SrcY)
 452             .addImm(SrcZ)
 453             .addImm(SrcW)
 454             .addImm(0)
 455             .addImm(0)
 456             .addImm(0)
 457             .addImm(0)
 458             .addImm(1)
 459             .addImm(2)
 460             .addImm(3)
 461             .addOperand(RID)
 462             .addOperand(SID)
 463             .addImm(CTX)
 464             .addImm(CTY)
 465             .addImm(CTZ)
 466             .addImm(CTW);
 467     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 468             .addOperand(MI->getOperand(0))
 469             .addOperand(MI->getOperand(1))
 470             .addImm(SrcX)
 471             .addImm(SrcY)
 472             .addImm(SrcZ)
 473             .addImm(SrcW)
 474             .addImm(0)
 475             .addImm(0)
 476             .addImm(0)
 477             .addImm(0)
 478             .addImm(1)
 479             .addImm(2)
 480             .addImm(3)
 481             .addOperand(RID)
 482             .addOperand(SID)
 483             .addImm(CTX)
 484             .addImm(CTY)
 485             .addImm(CTZ)
 486             .addImm(CTW)
 487             .addReg(T0, RegState::Implicit)
 488             .addReg(T1, RegState::Implicit);
 489     break;
 490   }
 491
 492   case AMDGPU::BRANCH:
 493       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 494               .addOperand(MI->getOperand(0));
 495       break;
 496
 497   case AMDGPU::BRANCH_COND_f32: {
 498     MachineInstr *NewMI =
 499       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 500               AMDGPU::PREDICATE_BIT)
 501               .addOperand(MI->getOperand(1))
 502               .addImm(OPCODE_IS_NOT_ZERO)
 503               .addImm(0); // Flags
 504     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 505     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 506             .addOperand(MI->getOperand(0))
 507             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 508     break;
 509   }
 510
 511   case AMDGPU::BRANCH_COND_i32: {
 512     MachineInstr *NewMI =
 513       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 514             AMDGPU::PREDICATE_BIT)
 515             .addOperand(MI->getOperand(1))
 516             .addImm(OPCODE_IS_NOT_ZERO_INT)
 517             .addImm(0); // Flags
 518     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 519     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 520            .addOperand(MI->getOperand(0))
 521             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 522     break;
 523   }
 524
 525   case AMDGPU::EG_ExportSwz:
 526   case AMDGPU::R600_ExportSwz: {
 527     // Instruction is left unmodified if its not the last one of its type
 528     bool isLastInstructionOfItsType = true;
 529     unsigned InstExportType = MI->getOperand(1).getImm();
 530     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 531          EndBlock = BB->end(); NextExportInst != EndBlock;
 532          NextExportInst = std::next(NextExportInst)) {
 533       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 534           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 535         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 536             .getImm();
 537         if (CurrentInstExportType == InstExportType) {
 538           isLastInstructionOfItsType = false;
 539           break;
 540         }
 541       }
 542     }
 543     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 544     if (!EOP && !isLastInstructionOfItsType)
 545       return BB;
 546     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 547     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 548             .addOperand(MI->getOperand(0))
 549             .addOperand(MI->getOperand(1))
 550             .addOperand(MI->getOperand(2))
 551             .addOperand(MI->getOperand(3))
 552             .addOperand(MI->getOperand(4))
 553             .addOperand(MI->getOperand(5))
 554             .addOperand(MI->getOperand(6))
 555             .addImm(CfInst)
 556             .addImm(EOP);
 557     break;
 558   }
 559   case AMDGPU::RETURN: {
 560     // RETURN instructions must have the live-out registers as implicit uses,
 561     // otherwise they appear dead.
 562     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 563     MachineInstrBuilder MIB(*MF, MI);
 564     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 565       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 566     return BB;
 567   }
 568   }
 569
 570   MI->eraseFromParent();
 571   return BB;
 572 }
 573
 574 //===----------------------------------------------------------------------===//
 575 // Custom DAG Lowering Operations
 576 //===----------------------------------------------------------------------===//
 577
 578 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 579   MachineFunction &MF = DAG.getMachineFunction();
 580   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 581   switch (Op.getOpcode()) {
 582   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 583   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 584   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 585   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 586   case ISD::SRA_PARTS:
 587   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 588   case ISD::FCOS:
 589   case ISD::FSIN: return LowerTrig(Op, DAG);
 590   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 591   case ISD::STORE: return LowerSTORE(Op, DAG);
 592   case ISD::LOAD: {
 593     SDValue Result = LowerLOAD(Op, DAG);
 594     assert((!Result.getNode() ||
 595             Result.getNode()->getNumValues() == 2) &&
 596            "Load should return a value and a chain");
 597     return Result;
 598   }
 599
 600   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 601   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 602   case ISD::INTRINSIC_VOID: {
 603     SDValue Chain = Op.getOperand(0);
 604     unsigned IntrinsicID =
 605                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 606     switch (IntrinsicID) {
 607     case AMDGPUIntrinsic::AMDGPU_store_output: {
 608       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 609       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 610       MFI->LiveOuts.push_back(Reg);
 611       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 612     }
 613     case AMDGPUIntrinsic::R600_store_swizzle: {
 614       const SDValue Args[8] = {
 615         Chain,
 616         Op.getOperand(2), // Export Value
 617         Op.getOperand(3), // ArrayBase
 618         Op.getOperand(4), // Type
 619         DAG.getConstant(0, MVT::i32), // SWZ_X
 620         DAG.getConstant(1, MVT::i32), // SWZ_Y
 621         DAG.getConstant(2, MVT::i32), // SWZ_Z
 622         DAG.getConstant(3, MVT::i32) // SWZ_W
 623       };
 624       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
 625     }
 626
 627     // default for switch(IntrinsicID)
 628     default: break;
 629     }
 630     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 631     break;
 632   }
 633   case ISD::INTRINSIC_WO_CHAIN: {
 634     unsigned IntrinsicID =
 635                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 636     EVT VT = Op.getValueType();
 637     SDLoc DL(Op);
 638     switch(IntrinsicID) {
 639     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 640     case AMDGPUIntrinsic::R600_load_input: {
 641       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 642       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 643       MachineFunction &MF = DAG.getMachineFunction();
 644       MachineRegisterInfo &MRI = MF.getRegInfo();
 645       MRI.addLiveIn(Reg);
 646       return DAG.getCopyFromReg(DAG.getEntryNode(),
 647           SDLoc(DAG.getEntryNode()), Reg, VT);
 648     }
 649
 650     case AMDGPUIntrinsic::R600_interp_input: {
 651       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 652       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 653       MachineSDNode *interp;
 654       if (ijb < 0) {
 655         const R600InstrInfo *TII =
 656             static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 657         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 658             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 659         return DAG.getTargetExtractSubreg(
 660             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 661             DL, MVT::f32, SDValue(interp, 0));
 662       }
 663       MachineFunction &MF = DAG.getMachineFunction();
 664       MachineRegisterInfo &MRI = MF.getRegInfo();
 665       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 666       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 667       MRI.addLiveIn(RegisterI);
 668       MRI.addLiveIn(RegisterJ);
 669       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 670           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 671       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 672           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 673
 674       if (slot % 4 < 2)
 675         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 676             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 677             RegisterJNode, RegisterINode);
 678       else
 679         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 680             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 681             RegisterJNode, RegisterINode);
 682       return SDValue(interp, slot % 2);
 683     }
 684     case AMDGPUIntrinsic::R600_interp_xy:
 685     case AMDGPUIntrinsic::R600_interp_zw: {
 686       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 687       MachineSDNode *interp;
 688       SDValue RegisterINode = Op.getOperand(2);
 689       SDValue RegisterJNode = Op.getOperand(3);
 690
 691       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 692         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 693             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 694             RegisterJNode, RegisterINode);
 695       else
 696         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 697             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 698             RegisterJNode, RegisterINode);
 699       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 700           SDValue(interp, 0), SDValue(interp, 1));
 701     }
 702     case AMDGPUIntrinsic::R600_tex:
 703     case AMDGPUIntrinsic::R600_texc:
 704     case AMDGPUIntrinsic::R600_txl:
 705     case AMDGPUIntrinsic::R600_txlc:
 706     case AMDGPUIntrinsic::R600_txb:
 707     case AMDGPUIntrinsic::R600_txbc:
 708     case AMDGPUIntrinsic::R600_txf:
 709     case AMDGPUIntrinsic::R600_txq:
 710     case AMDGPUIntrinsic::R600_ddx:
 711     case AMDGPUIntrinsic::R600_ddy:
 712     case AMDGPUIntrinsic::R600_ldptr: {
 713       unsigned TextureOp;
 714       switch (IntrinsicID) {
 715       case AMDGPUIntrinsic::R600_tex:
 716         TextureOp = 0;
 717         break;
 718       case AMDGPUIntrinsic::R600_texc:
 719         TextureOp = 1;
 720         break;
 721       case AMDGPUIntrinsic::R600_txl:
 722         TextureOp = 2;
 723         break;
 724       case AMDGPUIntrinsic::R600_txlc:
 725         TextureOp = 3;
 726         break;
 727       case AMDGPUIntrinsic::R600_txb:
 728         TextureOp = 4;
 729         break;
 730       case AMDGPUIntrinsic::R600_txbc:
 731         TextureOp = 5;
 732         break;
 733       case AMDGPUIntrinsic::R600_txf:
 734         TextureOp = 6;
 735         break;
 736       case AMDGPUIntrinsic::R600_txq:
 737         TextureOp = 7;
 738         break;
 739       case AMDGPUIntrinsic::R600_ddx:
 740         TextureOp = 8;
 741         break;
 742       case AMDGPUIntrinsic::R600_ddy:
 743         TextureOp = 9;
 744         break;
 745       case AMDGPUIntrinsic::R600_ldptr:
 746         TextureOp = 10;
 747         break;
 748       default:
 749         llvm_unreachable("Unknow Texture Operation");
 750       }
 751
 752       SDValue TexArgs[19] = {
 753         DAG.getConstant(TextureOp, MVT::i32),
 754         Op.getOperand(1),
 755         DAG.getConstant(0, MVT::i32),
 756         DAG.getConstant(1, MVT::i32),
 757         DAG.getConstant(2, MVT::i32),
 758         DAG.getConstant(3, MVT::i32),
 759         Op.getOperand(2),
 760         Op.getOperand(3),
 761         Op.getOperand(4),
 762         DAG.getConstant(0, MVT::i32),
 763         DAG.getConstant(1, MVT::i32),
 764         DAG.getConstant(2, MVT::i32),
 765         DAG.getConstant(3, MVT::i32),
 766         Op.getOperand(5),
 767         Op.getOperand(6),
 768         Op.getOperand(7),
 769         Op.getOperand(8),
 770         Op.getOperand(9),
 771         Op.getOperand(10)
 772       };
 773       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 774     }
 775     case AMDGPUIntrinsic::AMDGPU_dp4: {
 776       SDValue Args[8] = {
 777       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 778           DAG.getConstant(0, MVT::i32)),
 779       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 780           DAG.getConstant(0, MVT::i32)),
 781       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 782           DAG.getConstant(1, MVT::i32)),
 783       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 784           DAG.getConstant(1, MVT::i32)),
 785       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 786           DAG.getConstant(2, MVT::i32)),
 787       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 788           DAG.getConstant(2, MVT::i32)),
 789       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 790           DAG.getConstant(3, MVT::i32)),
 791       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 792           DAG.getConstant(3, MVT::i32))
 793       };
 794       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 795     }
 796
 797     case Intrinsic::r600_read_ngroups_x:
 798       return LowerImplicitParameter(DAG, VT, DL, 0);
 799     case Intrinsic::r600_read_ngroups_y:
 800       return LowerImplicitParameter(DAG, VT, DL, 1);
 801     case Intrinsic::r600_read_ngroups_z:
 802       return LowerImplicitParameter(DAG, VT, DL, 2);
 803     case Intrinsic::r600_read_global_size_x:
 804       return LowerImplicitParameter(DAG, VT, DL, 3);
 805     case Intrinsic::r600_read_global_size_y:
 806       return LowerImplicitParameter(DAG, VT, DL, 4);
 807     case Intrinsic::r600_read_global_size_z:
 808       return LowerImplicitParameter(DAG, VT, DL, 5);
 809     case Intrinsic::r600_read_local_size_x:
 810       return LowerImplicitParameter(DAG, VT, DL, 6);
 811     case Intrinsic::r600_read_local_size_y:
 812       return LowerImplicitParameter(DAG, VT, DL, 7);
 813     case Intrinsic::r600_read_local_size_z:
 814       return LowerImplicitParameter(DAG, VT, DL, 8);
 815
 816     case Intrinsic::AMDGPU_read_workdim:
 817       return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
 818
 819     case Intrinsic::r600_read_tgid_x:
 820       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 821                                   AMDGPU::T1_X, VT);
 822     case Intrinsic::r600_read_tgid_y:
 823       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 824                                   AMDGPU::T1_Y, VT);
 825     case Intrinsic::r600_read_tgid_z:
 826       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 827                                   AMDGPU::T1_Z, VT);
 828     case Intrinsic::r600_read_tidig_x:
 829       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 830                                   AMDGPU::T0_X, VT);
 831     case Intrinsic::r600_read_tidig_y:
 832       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 833                                   AMDGPU::T0_Y, VT);
 834     case Intrinsic::r600_read_tidig_z:
 835       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 836                                   AMDGPU::T0_Z, VT);
 837     case Intrinsic::AMDGPU_rsq:
 838       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 839       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 840
 841     case AMDGPUIntrinsic::AMDGPU_fract:
 842     case AMDGPUIntrinsic::AMDIL_fraction: // Legacy name.
 843       return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
 844     }
 845     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 846     break;
 847   }
 848   } // end switch(Op.getOpcode())
 849   return SDValue();
 850 }
 851
 852 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 853                                             SmallVectorImpl<SDValue> &Results,
 854                                             SelectionDAG &DAG) const {
 855   switch (N->getOpcode()) {
 856   default:
 857     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 858     return;
 859   case ISD::FP_TO_UINT:
 860     if (N->getValueType(0) == MVT::i1) {
 861       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 862       return;
 863     }
 864     // Fall-through. Since we don't care about out of bounds values
 865     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 866     // considers some extra cases which are not necessary here.
 867   case ISD::FP_TO_SINT: {
 868     SDValue Result;
 869     if (expandFP_TO_SINT(N, Result, DAG))
 870       Results.push_back(Result);
 871     return;
 872   }
 873   case ISD::UDIV: {
 874     SDValue Op = SDValue(N, 0);
 875     SDLoc DL(Op);
 876     EVT VT = Op.getValueType();
 877     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 878       N->getOperand(0), N->getOperand(1));
 879     Results.push_back(UDIVREM);
 880     break;
 881   }
 882   case ISD::UREM: {
 883     SDValue Op = SDValue(N, 0);
 884     SDLoc DL(Op);
 885     EVT VT = Op.getValueType();
 886     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 887       N->getOperand(0), N->getOperand(1));
 888     Results.push_back(UDIVREM.getValue(1));
 889     break;
 890   }
 891   case ISD::SDIV: {
 892     SDValue Op = SDValue(N, 0);
 893     SDLoc DL(Op);
 894     EVT VT = Op.getValueType();
 895     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 896       N->getOperand(0), N->getOperand(1));
 897     Results.push_back(SDIVREM);
 898     break;
 899   }
 900   case ISD::SREM: {
 901     SDValue Op = SDValue(N, 0);
 902     SDLoc DL(Op);
 903     EVT VT = Op.getValueType();
 904     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 905       N->getOperand(0), N->getOperand(1));
 906     Results.push_back(SDIVREM.getValue(1));
 907     break;
 908   }
 909   case ISD::SDIVREM: {
 910     SDValue Op = SDValue(N, 1);
 911     SDValue RES = LowerSDIVREM(Op, DAG);
 912     Results.push_back(RES);
 913     Results.push_back(RES.getValue(1));
 914     break;
 915   }
 916   case ISD::UDIVREM: {
 917     SDValue Op = SDValue(N, 0);
 918     LowerUDIVREM64(Op, DAG, Results);
 919     break;
 920   }
 921   }
 922 }
 923
 924 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 925                                                    SDValue Vector) const {
 926
 927   SDLoc DL(Vector);
 928   EVT VecVT = Vector.getValueType();
 929   EVT EltVT = VecVT.getVectorElementType();
 930   SmallVector<SDValue, 8> Args;
 931
 932   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 933                                                            i != e; ++i) {
 934     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
 935                                Vector, DAG.getConstant(i, getVectorIdxTy())));
 936   }
 937
 938   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 939 }
 940
 941 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 942                                                     SelectionDAG &DAG) const {
 943
 944   SDLoc DL(Op);
 945   SDValue Vector = Op.getOperand(0);
 946   SDValue Index = Op.getOperand(1);
 947
 948   if (isa<ConstantSDNode>(Index) ||
 949       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 950     return Op;
 951
 952   Vector = vectorToVerticalVector(DAG, Vector);
 953   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 954                      Vector, Index);
 955 }
 956
 957 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 958                                                    SelectionDAG &DAG) const {
 959   SDLoc DL(Op);
 960   SDValue Vector = Op.getOperand(0);
 961   SDValue Value = Op.getOperand(1);
 962   SDValue Index = Op.getOperand(2);
 963
 964   if (isa<ConstantSDNode>(Index) ||
 965       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 966     return Op;
 967
 968   Vector = vectorToVerticalVector(DAG, Vector);
 969   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 970                                Vector, Value, Index);
 971   return vectorToVerticalVector(DAG, Insert);
 972 }
 973
 974 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 975   // On hw >= R700, COS/SIN input must be between -1. and 1.
 976   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 977   EVT VT = Op.getValueType();
 978   SDValue Arg = Op.getOperand(0);
 979   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
 980       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
 981         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
 982           DAG.getConstantFP(0.15915494309, MVT::f32)),
 983         DAG.getConstantFP(0.5, MVT::f32)));
 984   unsigned TrigNode;
 985   switch (Op.getOpcode()) {
 986   case ISD::FCOS:
 987     TrigNode = AMDGPUISD::COS_HW;
 988     break;
 989   case ISD::FSIN:
 990     TrigNode = AMDGPUISD::SIN_HW;
 991     break;
 992   default:
 993     llvm_unreachable("Wrong trig opcode");
 994   }
 995   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
 996       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
 997         DAG.getConstantFP(-0.5, MVT::f32)));
 998   if (Gen >= AMDGPUSubtarget::R700)
 999     return TrigVal;
1000   // On R600 hw, COS/SIN input must be between -Pi and Pi.
1001   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
1002       DAG.getConstantFP(3.14159265359, MVT::f32));
1003 }
1004
1005 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1006   SDLoc DL(Op);
1007   EVT VT = Op.getValueType();
1008
1009   SDValue Lo = Op.getOperand(0);
1010   SDValue Hi = Op.getOperand(1);
1011   SDValue Shift = Op.getOperand(2);
1012   SDValue Zero = DAG.getConstant(0, VT);
1013   SDValue One  = DAG.getConstant(1, VT);
1014
1015   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1016   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1017   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1018   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1019
1020   // The dance around Width1 is necessary for 0 special case.
1021   // Without it the CompShift might be 32, producing incorrect results in
1022   // Overflow. So we do the shift in two steps, the alternative is to
1023   // add a conditional to filter the special case.
1024
1025   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1026   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1027
1028   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1029   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1030   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1031
1032   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1033   SDValue LoBig = Zero;
1034
1035   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1036   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1037
1038   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1039 }
1040
1041 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1042   SDLoc DL(Op);
1043   EVT VT = Op.getValueType();
1044
1045   SDValue Lo = Op.getOperand(0);
1046   SDValue Hi = Op.getOperand(1);
1047   SDValue Shift = Op.getOperand(2);
1048   SDValue Zero = DAG.getConstant(0, VT);
1049   SDValue One  = DAG.getConstant(1, VT);
1050
1051   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1052
1053   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1054   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1055   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1056   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1057
1058   // The dance around Width1 is necessary for 0 special case.
1059   // Without it the CompShift might be 32, producing incorrect results in
1060   // Overflow. So we do the shift in two steps, the alternative is to
1061   // add a conditional to filter the special case.
1062
1063   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1064   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1065
1066   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1067   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1068   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1069
1070   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1071   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1072
1073   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1074   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1075
1076   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1077 }
1078
1079 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1080   return DAG.getNode(
1081       ISD::SETCC,
1082       SDLoc(Op),
1083       MVT::i1,
1084       Op, DAG.getConstantFP(0.0f, MVT::f32),
1085       DAG.getCondCode(ISD::SETNE)
1086       );
1087 }
1088
1089 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1090                                                    SDLoc DL,
1091                                                    unsigned DwordOffset) const {
1092   unsigned ByteOffset = DwordOffset * 4;
1093   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1094                                       AMDGPUAS::CONSTANT_BUFFER_0);
1095
1096   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1097   assert(isInt<16>(ByteOffset));
1098
1099   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1100                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1101                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1102                      false, false, false, 0);
1103 }
1104
1105 bool R600TargetLowering::isZero(SDValue Op) const {
1106   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1107     return Cst->isNullValue();
1108   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1109     return CstFP->isZero();
1110   } else {
1111     return false;
1112   }
1113 }
1114
1115 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1116   SDLoc DL(Op);
1117   EVT VT = Op.getValueType();
1118
1119   SDValue LHS = Op.getOperand(0);
1120   SDValue RHS = Op.getOperand(1);
1121   SDValue True = Op.getOperand(2);
1122   SDValue False = Op.getOperand(3);
1123   SDValue CC = Op.getOperand(4);
1124   SDValue Temp;
1125
1126   if (VT == MVT::f32) {
1127     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1128     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1129     if (MinMax)
1130       return MinMax;
1131   }
1132
1133   // LHS and RHS are guaranteed to be the same value type
1134   EVT CompareVT = LHS.getValueType();
1135
1136   // Check if we can lower this to a native operation.
1137
1138   // Try to lower to a SET* instruction:
1139   //
1140   // SET* can match the following patterns:
1141   //
1142   // select_cc f32, f32, -1,  0, cc_supported
1143   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1144   // select_cc i32, i32, -1,  0, cc_supported
1145   //
1146
1147   // Move hardware True/False values to the correct operand.
1148   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1149   ISD::CondCode InverseCC =
1150      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1151   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1152     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1153       std::swap(False, True);
1154       CC = DAG.getCondCode(InverseCC);
1155     } else {
1156       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1157       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1158         std::swap(False, True);
1159         std::swap(LHS, RHS);
1160         CC = DAG.getCondCode(SwapInvCC);
1161       }
1162     }
1163   }
1164
1165   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1166       (CompareVT == VT || VT == MVT::i32)) {
1167     // This can be matched by a SET* instruction.
1168     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1169   }
1170
1171   // Try to lower to a CND* instruction:
1172   //
1173   // CND* can match the following patterns:
1174   //
1175   // select_cc f32, 0.0, f32, f32, cc_supported
1176   // select_cc f32, 0.0, i32, i32, cc_supported
1177   // select_cc i32, 0,   f32, f32, cc_supported
1178   // select_cc i32, 0,   i32, i32, cc_supported
1179   //
1180
1181   // Try to move the zero value to the RHS
1182   if (isZero(LHS)) {
1183     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1184     // Try swapping the operands
1185     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1186     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1187       std::swap(LHS, RHS);
1188       CC = DAG.getCondCode(CCSwapped);
1189     } else {
1190       // Try inverting the conditon and then swapping the operands
1191       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1192       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1193       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1194         std::swap(True, False);
1195         std::swap(LHS, RHS);
1196         CC = DAG.getCondCode(CCSwapped);
1197       }
1198     }
1199   }
1200   if (isZero(RHS)) {
1201     SDValue Cond = LHS;
1202     SDValue Zero = RHS;
1203     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1204     if (CompareVT != VT) {
1205       // Bitcast True / False to the correct types.  This will end up being
1206       // a nop, but it allows us to define only a single pattern in the
1207       // .TD files for each CND* instruction rather than having to have
1208       // one pattern for integer True/False and one for fp True/False
1209       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1210       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1211     }
1212
1213     switch (CCOpcode) {
1214     case ISD::SETONE:
1215     case ISD::SETUNE:
1216     case ISD::SETNE:
1217       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1218       Temp = True;
1219       True = False;
1220       False = Temp;
1221       break;
1222     default:
1223       break;
1224     }
1225     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1226         Cond, Zero,
1227         True, False,
1228         DAG.getCondCode(CCOpcode));
1229     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1230   }
1231
1232   // If we make it this for it means we have no native instructions to handle
1233   // this SELECT_CC, so we must lower it.
1234   SDValue HWTrue, HWFalse;
1235
1236   if (CompareVT == MVT::f32) {
1237     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1238     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1239   } else if (CompareVT == MVT::i32) {
1240     HWTrue = DAG.getConstant(-1, CompareVT);
1241     HWFalse = DAG.getConstant(0, CompareVT);
1242   }
1243   else {
1244     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1245   }
1246
1247   // Lower this unsupported SELECT_CC into a combination of two supported
1248   // SELECT_CC operations.
1249   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1250
1251   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1252       Cond, HWFalse,
1253       True, False,
1254       DAG.getCondCode(ISD::SETNE));
1255 }
1256
1257 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1258 /// convert these pointers to a register index.  Each register holds
1259 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1260 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1261 /// for indirect addressing.
1262 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1263                                                unsigned StackWidth,
1264                                                SelectionDAG &DAG) const {
1265   unsigned SRLPad;
1266   switch(StackWidth) {
1267   case 1:
1268     SRLPad = 2;
1269     break;
1270   case 2:
1271     SRLPad = 3;
1272     break;
1273   case 4:
1274     SRLPad = 4;
1275     break;
1276   default: llvm_unreachable("Invalid stack width");
1277   }
1278
1279   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1280                      DAG.getConstant(SRLPad, MVT::i32));
1281 }
1282
1283 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1284                                          unsigned ElemIdx,
1285                                          unsigned &Channel,
1286                                          unsigned &PtrIncr) const {
1287   switch (StackWidth) {
1288   default:
1289   case 1:
1290     Channel = 0;
1291     if (ElemIdx > 0) {
1292       PtrIncr = 1;
1293     } else {
1294       PtrIncr = 0;
1295     }
1296     break;
1297   case 2:
1298     Channel = ElemIdx % 2;
1299     if (ElemIdx == 2) {
1300       PtrIncr = 1;
1301     } else {
1302       PtrIncr = 0;
1303     }
1304     break;
1305   case 4:
1306     Channel = ElemIdx;
1307     PtrIncr = 0;
1308     break;
1309   }
1310 }
1311
1312 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1313   SDLoc DL(Op);
1314   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1315   SDValue Chain = Op.getOperand(0);
1316   SDValue Value = Op.getOperand(1);
1317   SDValue Ptr = Op.getOperand(2);
1318
1319   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1320   if (Result.getNode()) {
1321     return Result;
1322   }
1323
1324   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1325     if (StoreNode->isTruncatingStore()) {
1326       EVT VT = Value.getValueType();
1327       assert(VT.bitsLE(MVT::i32));
1328       EVT MemVT = StoreNode->getMemoryVT();
1329       SDValue MaskConstant;
1330       if (MemVT == MVT::i8) {
1331         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1332       } else {
1333         assert(MemVT == MVT::i16);
1334         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1335       }
1336       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1337                                       DAG.getConstant(2, MVT::i32));
1338       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1339                                       DAG.getConstant(0x00000003, VT));
1340       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1341       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1342                                    DAG.getConstant(3, VT));
1343       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1344       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1345       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1346       // vector instead.
1347       SDValue Src[4] = {
1348         ShiftedValue,
1349         DAG.getConstant(0, MVT::i32),
1350         DAG.getConstant(0, MVT::i32),
1351         Mask
1352       };
1353       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1354       SDValue Args[3] = { Chain, Input, DWordAddr };
1355       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1356                                      Op->getVTList(), Args, MemVT,
1357                                      StoreNode->getMemOperand());
1358     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1359                Value.getValueType().bitsGE(MVT::i32)) {
1360       // Convert pointer from byte address to dword address.
1361       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1362                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1363                                     Ptr, DAG.getConstant(2, MVT::i32)));
1364
1365       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1366         llvm_unreachable("Truncated and indexed stores not supported yet");
1367       } else {
1368         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1369       }
1370       return Chain;
1371     }
1372   }
1373
1374   EVT ValueVT = Value.getValueType();
1375
1376   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1377     return SDValue();
1378   }
1379
1380   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1381   if (Ret.getNode()) {
1382     return Ret;
1383   }
1384   // Lowering for indirect addressing
1385
1386   const MachineFunction &MF = DAG.getMachineFunction();
1387   const AMDGPUFrameLowering *TFL =
1388       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1389   unsigned StackWidth = TFL->getStackWidth(MF);
1390
1391   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1392
1393   if (ValueVT.isVector()) {
1394     unsigned NumElemVT = ValueVT.getVectorNumElements();
1395     EVT ElemVT = ValueVT.getVectorElementType();
1396     SmallVector<SDValue, 4> Stores(NumElemVT);
1397
1398     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1399                                       "vector width in load");
1400
1401     for (unsigned i = 0; i < NumElemVT; ++i) {
1402       unsigned Channel, PtrIncr;
1403       getStackAddress(StackWidth, i, Channel, PtrIncr);
1404       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1405                         DAG.getConstant(PtrIncr, MVT::i32));
1406       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1407                                  Value, DAG.getConstant(i, MVT::i32));
1408
1409       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1410                               Chain, Elem, Ptr,
1411                               DAG.getTargetConstant(Channel, MVT::i32));
1412     }
1413      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1414    } else {
1415     if (ValueVT == MVT::i8) {
1416       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1417     }
1418     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1419     DAG.getTargetConstant(0, MVT::i32)); // Channel
1420   }
1421
1422   return Chain;
1423 }
1424
1425 // return (512 + (kc_bank << 12)
1426 static int
1427 ConstantAddressBlock(unsigned AddressSpace) {
1428   switch (AddressSpace) {
1429   case AMDGPUAS::CONSTANT_BUFFER_0:
1430     return 512;
1431   case AMDGPUAS::CONSTANT_BUFFER_1:
1432     return 512 + 4096;
1433   case AMDGPUAS::CONSTANT_BUFFER_2:
1434     return 512 + 4096 * 2;
1435   case AMDGPUAS::CONSTANT_BUFFER_3:
1436     return 512 + 4096 * 3;
1437   case AMDGPUAS::CONSTANT_BUFFER_4:
1438     return 512 + 4096 * 4;
1439   case AMDGPUAS::CONSTANT_BUFFER_5:
1440     return 512 + 4096 * 5;
1441   case AMDGPUAS::CONSTANT_BUFFER_6:
1442     return 512 + 4096 * 6;
1443   case AMDGPUAS::CONSTANT_BUFFER_7:
1444     return 512 + 4096 * 7;
1445   case AMDGPUAS::CONSTANT_BUFFER_8:
1446     return 512 + 4096 * 8;
1447   case AMDGPUAS::CONSTANT_BUFFER_9:
1448     return 512 + 4096 * 9;
1449   case AMDGPUAS::CONSTANT_BUFFER_10:
1450     return 512 + 4096 * 10;
1451   case AMDGPUAS::CONSTANT_BUFFER_11:
1452     return 512 + 4096 * 11;
1453   case AMDGPUAS::CONSTANT_BUFFER_12:
1454     return 512 + 4096 * 12;
1455   case AMDGPUAS::CONSTANT_BUFFER_13:
1456     return 512 + 4096 * 13;
1457   case AMDGPUAS::CONSTANT_BUFFER_14:
1458     return 512 + 4096 * 14;
1459   case AMDGPUAS::CONSTANT_BUFFER_15:
1460     return 512 + 4096 * 15;
1461   default:
1462     return -1;
1463   }
1464 }
1465
1466 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1467 {
1468   EVT VT = Op.getValueType();
1469   SDLoc DL(Op);
1470   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1471   SDValue Chain = Op.getOperand(0);
1472   SDValue Ptr = Op.getOperand(1);
1473   SDValue LoweredLoad;
1474
1475   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1476   if (Ret.getNode()) {
1477     SDValue Ops[2] = {
1478       Ret,
1479       Chain
1480     };
1481     return DAG.getMergeValues(Ops, DL);
1482   }
1483
1484   // Lower loads constant address space global variable loads
1485   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1486       isa<GlobalVariable>(GetUnderlyingObject(
1487           LoadNode->getMemOperand()->getValue(), *getDataLayout()))) {
1488
1489     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1490         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1491     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1492         DAG.getConstant(2, MVT::i32));
1493     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1494                        LoadNode->getChain(), Ptr,
1495                        DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
1496   }
1497
1498   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1499     SDValue MergedValues[2] = {
1500       ScalarizeVectorLoad(Op, DAG),
1501       Chain
1502     };
1503     return DAG.getMergeValues(MergedValues, DL);
1504   }
1505
1506   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1507   if (ConstantBlock > -1 &&
1508       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1509        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1510     SDValue Result;
1511     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1512         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1513         isa<ConstantSDNode>(Ptr)) {
1514       SDValue Slots[4];
1515       for (unsigned i = 0; i < 4; i++) {
1516         // We want Const position encoded with the following formula :
1517         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1518         // const_index is Ptr computed by llvm using an alignment of 16.
1519         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1520         // then div by 4 at the ISel step
1521         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1522             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1523         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1524       }
1525       EVT NewVT = MVT::v4i32;
1526       unsigned NumElements = 4;
1527       if (VT.isVector()) {
1528         NewVT = VT;
1529         NumElements = VT.getVectorNumElements();
1530       }
1531       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1532                            makeArrayRef(Slots, NumElements));
1533     } else {
1534       // non-constant ptr can't be folded, keeps it as a v4f32 load
1535       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1536           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1537           DAG.getConstant(LoadNode->getAddressSpace() -
1538                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1539           );
1540     }
1541
1542     if (!VT.isVector()) {
1543       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1544           DAG.getConstant(0, MVT::i32));
1545     }
1546
1547     SDValue MergedValues[2] = {
1548       Result,
1549       Chain
1550     };
1551     return DAG.getMergeValues(MergedValues, DL);
1552   }
1553
1554   // For most operations returning SDValue() will result in the node being
1555   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1556   // need to manually expand loads that may be legal in some address spaces and
1557   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1558   // compute shaders, since the data is sign extended when it is uploaded to the
1559   // buffer. However SEXT loads from other address spaces are not supported, so
1560   // we need to expand them here.
1561   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1562     EVT MemVT = LoadNode->getMemoryVT();
1563     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1564     SDValue ShiftAmount =
1565           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1566     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1567                                   LoadNode->getPointerInfo(), MemVT,
1568                                   LoadNode->isVolatile(),
1569                                   LoadNode->isNonTemporal(),
1570                                   LoadNode->isInvariant(),
1571                                   LoadNode->getAlignment());
1572     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1573     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1574
1575     SDValue MergedValues[2] = { Sra, Chain };
1576     return DAG.getMergeValues(MergedValues, DL);
1577   }
1578
1579   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1580     return SDValue();
1581   }
1582
1583   // Lowering for indirect addressing
1584   const MachineFunction &MF = DAG.getMachineFunction();
1585   const AMDGPUFrameLowering *TFL =
1586       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1587   unsigned StackWidth = TFL->getStackWidth(MF);
1588
1589   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1590
1591   if (VT.isVector()) {
1592     unsigned NumElemVT = VT.getVectorNumElements();
1593     EVT ElemVT = VT.getVectorElementType();
1594     SDValue Loads[4];
1595
1596     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1597                                       "vector width in load");
1598
1599     for (unsigned i = 0; i < NumElemVT; ++i) {
1600       unsigned Channel, PtrIncr;
1601       getStackAddress(StackWidth, i, Channel, PtrIncr);
1602       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1603                         DAG.getConstant(PtrIncr, MVT::i32));
1604       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1605                              Chain, Ptr,
1606                              DAG.getTargetConstant(Channel, MVT::i32),
1607                              Op.getOperand(2));
1608     }
1609     for (unsigned i = NumElemVT; i < 4; ++i) {
1610       Loads[i] = DAG.getUNDEF(ElemVT);
1611     }
1612     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1613     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1614   } else {
1615     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1616                               Chain, Ptr,
1617                               DAG.getTargetConstant(0, MVT::i32), // Channel
1618                               Op.getOperand(2));
1619   }
1620
1621   SDValue Ops[2] = {
1622     LoweredLoad,
1623     Chain
1624   };
1625
1626   return DAG.getMergeValues(Ops, DL);
1627 }
1628
1629 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1630   SDValue Chain = Op.getOperand(0);
1631   SDValue Cond  = Op.getOperand(1);
1632   SDValue Jump  = Op.getOperand(2);
1633
1634   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1635                      Chain, Jump, Cond);
1636 }
1637
1638 /// XXX Only kernel functions are supported, so we can assume for now that
1639 /// every function is a kernel function, but in the future we should use
1640 /// separate calling conventions for kernel and non-kernel functions.
1641 SDValue R600TargetLowering::LowerFormalArguments(
1642                                       SDValue Chain,
1643                                       CallingConv::ID CallConv,
1644                                       bool isVarArg,
1645                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1646                                       SDLoc DL, SelectionDAG &DAG,
1647                                       SmallVectorImpl<SDValue> &InVals) const {
1648   SmallVector<CCValAssign, 16> ArgLocs;
1649   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1650                  *DAG.getContext());
1651   MachineFunction &MF = DAG.getMachineFunction();
1652   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1653
1654   SmallVector<ISD::InputArg, 8> LocalIns;
1655
1656   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1657
1658   AnalyzeFormalArguments(CCInfo, LocalIns);
1659
1660   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1661     CCValAssign &VA = ArgLocs[i];
1662     const ISD::InputArg &In = Ins[i];
1663     EVT VT = In.VT;
1664     EVT MemVT = VA.getLocVT();
1665     if (!VT.isVector() && MemVT.isVector()) {
1666       // Get load source type if scalarized.
1667       MemVT = MemVT.getVectorElementType();
1668     }
1669
1670     if (MFI->getShaderType() != ShaderType::COMPUTE) {
1671       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1672       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1673       InVals.push_back(Register);
1674       continue;
1675     }
1676
1677     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1678                                           AMDGPUAS::CONSTANT_BUFFER_0);
1679
1680     // i64 isn't a legal type, so the register type used ends up as i32, which
1681     // isn't expected here. It attempts to create this sextload, but it ends up
1682     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1683     // for <1 x i64>.
1684
1685     // The first 36 bytes of the input buffer contains information about
1686     // thread group and global sizes.
1687     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1688     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1689       // FIXME: This should really check the extload type, but the handling of
1690       // extload vector parameters seems to be broken.
1691
1692       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1693       Ext = ISD::SEXTLOAD;
1694     }
1695
1696     // Compute the offset from the value.
1697     // XXX - I think PartOffset should give you this, but it seems to give the
1698     // size of the register which isn't useful.
1699
1700     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1701     unsigned PartOffset = VA.getLocMemOffset();
1702     unsigned Offset = 36 + VA.getLocMemOffset();
1703
1704     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1705     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1706                               DAG.getConstant(Offset, MVT::i32),
1707                               DAG.getUNDEF(MVT::i32),
1708                               PtrInfo,
1709                               MemVT, false, true, true, 4);
1710
1711     // 4 is the preferred alignment for the CONSTANT memory space.
1712     InVals.push_back(Arg);
1713     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1714   }
1715   return Chain;
1716 }
1717
1718 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1719    if (!VT.isVector())
1720      return MVT::i32;
1721    return VT.changeVectorElementTypeToInteger();
1722 }
1723
1724 static SDValue CompactSwizzlableVector(
1725   SelectionDAG &DAG, SDValue VectorEntry,
1726   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1727   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1728   assert(RemapSwizzle.empty());
1729   SDValue NewBldVec[4] = {
1730     VectorEntry.getOperand(0),
1731     VectorEntry.getOperand(1),
1732     VectorEntry.getOperand(2),
1733     VectorEntry.getOperand(3)
1734   };
1735
1736   for (unsigned i = 0; i < 4; i++) {
1737     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1738       // We mask write here to teach later passes that the ith element of this
1739       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1740       // break false dependencies and additionnaly make assembly easier to read.
1741       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1742     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1743       if (C->isZero()) {
1744         RemapSwizzle[i] = 4; // SEL_0
1745         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1746       } else if (C->isExactlyValue(1.0)) {
1747         RemapSwizzle[i] = 5; // SEL_1
1748         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1749       }
1750     }
1751
1752     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1753       continue;
1754     for (unsigned j = 0; j < i; j++) {
1755       if (NewBldVec[i] == NewBldVec[j]) {
1756         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1757         RemapSwizzle[i] = j;
1758         break;
1759       }
1760     }
1761   }
1762
1763   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1764                      VectorEntry.getValueType(), NewBldVec);
1765 }
1766
1767 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1768                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1769   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1770   assert(RemapSwizzle.empty());
1771   SDValue NewBldVec[4] = {
1772       VectorEntry.getOperand(0),
1773       VectorEntry.getOperand(1),
1774       VectorEntry.getOperand(2),
1775       VectorEntry.getOperand(3)
1776   };
1777   bool isUnmovable[4] = { false, false, false, false };
1778   for (unsigned i = 0; i < 4; i++) {
1779     RemapSwizzle[i] = i;
1780     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1781       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1782           ->getZExtValue();
1783       if (i == Idx)
1784         isUnmovable[Idx] = true;
1785     }
1786   }
1787
1788   for (unsigned i = 0; i < 4; i++) {
1789     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1790       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1791           ->getZExtValue();
1792       if (isUnmovable[Idx])
1793         continue;
1794       // Swap i and Idx
1795       std::swap(NewBldVec[Idx], NewBldVec[i]);
1796       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1797       break;
1798     }
1799   }
1800
1801   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1802                      VectorEntry.getValueType(), NewBldVec);
1803 }
1804
1805
1806 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1807 SDValue Swz[4], SelectionDAG &DAG) const {
1808   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1809   // Old -> New swizzle values
1810   DenseMap<unsigned, unsigned> SwizzleRemap;
1811
1812   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1813   for (unsigned i = 0; i < 4; i++) {
1814     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1815     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1816       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1817   }
1818
1819   SwizzleRemap.clear();
1820   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1821   for (unsigned i = 0; i < 4; i++) {
1822     unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
1823     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1824       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1825   }
1826
1827   return BuildVector;
1828 }
1829
1830
1831 //===----------------------------------------------------------------------===//
1832 // Custom DAG Optimizations
1833 //===----------------------------------------------------------------------===//
1834
1835 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1836                                               DAGCombinerInfo &DCI) const {
1837   SelectionDAG &DAG = DCI.DAG;
1838
1839   switch (N->getOpcode()) {
1840   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1841   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1842   case ISD::FP_ROUND: {
1843       SDValue Arg = N->getOperand(0);
1844       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1845         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1846                            Arg.getOperand(0));
1847       }
1848       break;
1849     }
1850
1851   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1852   // (i32 select_cc f32, f32, -1, 0 cc)
1853   //
1854   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1855   // this to one of the SET*_DX10 instructions.
1856   case ISD::FP_TO_SINT: {
1857     SDValue FNeg = N->getOperand(0);
1858     if (FNeg.getOpcode() != ISD::FNEG) {
1859       return SDValue();
1860     }
1861     SDValue SelectCC = FNeg.getOperand(0);
1862     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1863         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1864         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1865         !isHWTrueValue(SelectCC.getOperand(2)) ||
1866         !isHWFalseValue(SelectCC.getOperand(3))) {
1867       return SDValue();
1868     }
1869
1870     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1871                            SelectCC.getOperand(0), // LHS
1872                            SelectCC.getOperand(1), // RHS
1873                            DAG.getConstant(-1, MVT::i32), // True
1874                            DAG.getConstant(0, MVT::i32),  // False
1875                            SelectCC.getOperand(4)); // CC
1876
1877     break;
1878   }
1879
1880   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1881   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1882   case ISD::INSERT_VECTOR_ELT: {
1883     SDValue InVec = N->getOperand(0);
1884     SDValue InVal = N->getOperand(1);
1885     SDValue EltNo = N->getOperand(2);
1886     SDLoc dl(N);
1887
1888     // If the inserted element is an UNDEF, just use the input vector.
1889     if (InVal.getOpcode() == ISD::UNDEF)
1890       return InVec;
1891
1892     EVT VT = InVec.getValueType();
1893
1894     // If we can't generate a legal BUILD_VECTOR, exit
1895     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1896       return SDValue();
1897
1898     // Check that we know which element is being inserted
1899     if (!isa<ConstantSDNode>(EltNo))
1900       return SDValue();
1901     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1902
1903     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1904     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1905     // vector elements.
1906     SmallVector<SDValue, 8> Ops;
1907     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1908       Ops.append(InVec.getNode()->op_begin(),
1909                  InVec.getNode()->op_end());
1910     } else if (InVec.getOpcode() == ISD::UNDEF) {
1911       unsigned NElts = VT.getVectorNumElements();
1912       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1913     } else {
1914       return SDValue();
1915     }
1916
1917     // Insert the element
1918     if (Elt < Ops.size()) {
1919       // All the operands of BUILD_VECTOR must have the same type;
1920       // we enforce that here.
1921       EVT OpVT = Ops[0].getValueType();
1922       if (InVal.getValueType() != OpVT)
1923         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1924           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1925           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1926       Ops[Elt] = InVal;
1927     }
1928
1929     // Return the new vector
1930     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1931   }
1932
1933   // Extract_vec (Build_vector) generated by custom lowering
1934   // also needs to be customly combined
1935   case ISD::EXTRACT_VECTOR_ELT: {
1936     SDValue Arg = N->getOperand(0);
1937     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1938       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1939         unsigned Element = Const->getZExtValue();
1940         return Arg->getOperand(Element);
1941       }
1942     }
1943     if (Arg.getOpcode() == ISD::BITCAST &&
1944         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1945       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1946         unsigned Element = Const->getZExtValue();
1947         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1948             Arg->getOperand(0).getOperand(Element));
1949       }
1950     }
1951   }
1952
1953   case ISD::SELECT_CC: {
1954     // Try common optimizations
1955     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1956     if (Ret.getNode())
1957       return Ret;
1958
1959     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1960     //      selectcc x, y, a, b, inv(cc)
1961     //
1962     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1963     //      selectcc x, y, a, b, cc
1964     SDValue LHS = N->getOperand(0);
1965     if (LHS.getOpcode() != ISD::SELECT_CC) {
1966       return SDValue();
1967     }
1968
1969     SDValue RHS = N->getOperand(1);
1970     SDValue True = N->getOperand(2);
1971     SDValue False = N->getOperand(3);
1972     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1973
1974     if (LHS.getOperand(2).getNode() != True.getNode() ||
1975         LHS.getOperand(3).getNode() != False.getNode() ||
1976         RHS.getNode() != False.getNode()) {
1977       return SDValue();
1978     }
1979
1980     switch (NCC) {
1981     default: return SDValue();
1982     case ISD::SETNE: return LHS;
1983     case ISD::SETEQ: {
1984       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1985       LHSCC = ISD::getSetCCInverse(LHSCC,
1986                                   LHS.getOperand(0).getValueType().isInteger());
1987       if (DCI.isBeforeLegalizeOps() ||
1988           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1989         return DAG.getSelectCC(SDLoc(N),
1990                                LHS.getOperand(0),
1991                                LHS.getOperand(1),
1992                                LHS.getOperand(2),
1993                                LHS.getOperand(3),
1994                                LHSCC);
1995       break;
1996     }
1997     }
1998     return SDValue();
1999   }
2000
2001   case AMDGPUISD::EXPORT: {
2002     SDValue Arg = N->getOperand(1);
2003     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2004       break;
2005
2006     SDValue NewArgs[8] = {
2007       N->getOperand(0), // Chain
2008       SDValue(),
2009       N->getOperand(2), // ArrayBase
2010       N->getOperand(3), // Type
2011       N->getOperand(4), // SWZ_X
2012       N->getOperand(5), // SWZ_Y
2013       N->getOperand(6), // SWZ_Z
2014       N->getOperand(7) // SWZ_W
2015     };
2016     SDLoc DL(N);
2017     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2018     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2019   }
2020   case AMDGPUISD::TEXTURE_FETCH: {
2021     SDValue Arg = N->getOperand(1);
2022     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2023       break;
2024
2025     SDValue NewArgs[19] = {
2026       N->getOperand(0),
2027       N->getOperand(1),
2028       N->getOperand(2),
2029       N->getOperand(3),
2030       N->getOperand(4),
2031       N->getOperand(5),
2032       N->getOperand(6),
2033       N->getOperand(7),
2034       N->getOperand(8),
2035       N->getOperand(9),
2036       N->getOperand(10),
2037       N->getOperand(11),
2038       N->getOperand(12),
2039       N->getOperand(13),
2040       N->getOperand(14),
2041       N->getOperand(15),
2042       N->getOperand(16),
2043       N->getOperand(17),
2044       N->getOperand(18),
2045     };
2046     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2047     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2048         NewArgs);
2049   }
2050   }
2051
2052   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2053 }
2054
2055 static bool
2056 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2057             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2058   const R600InstrInfo *TII =
2059       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2060   if (!Src.isMachineOpcode())
2061     return false;
2062   switch (Src.getMachineOpcode()) {
2063   case AMDGPU::FNEG_R600:
2064     if (!Neg.getNode())
2065       return false;
2066     Src = Src.getOperand(0);
2067     Neg = DAG.getTargetConstant(1, MVT::i32);
2068     return true;
2069   case AMDGPU::FABS_R600:
2070     if (!Abs.getNode())
2071       return false;
2072     Src = Src.getOperand(0);
2073     Abs = DAG.getTargetConstant(1, MVT::i32);
2074     return true;
2075   case AMDGPU::CONST_COPY: {
2076     unsigned Opcode = ParentNode->getMachineOpcode();
2077     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2078
2079     if (!Sel.getNode())
2080       return false;
2081
2082     SDValue CstOffset = Src.getOperand(0);
2083     if (ParentNode->getValueType(0).isVector())
2084       return false;
2085
2086     // Gather constants values
2087     int SrcIndices[] = {
2088       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2089       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2090       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2091       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2092       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2093       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2094       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2095       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2096       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2097       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2098       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2099     };
2100     std::vector<unsigned> Consts;
2101     for (int OtherSrcIdx : SrcIndices) {
2102       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2103       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2104         continue;
2105       if (HasDst) {
2106         OtherSrcIdx--;
2107         OtherSelIdx--;
2108       }
2109       if (RegisterSDNode *Reg =
2110           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2111         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2112           ConstantSDNode *Cst
2113             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2114           Consts.push_back(Cst->getZExtValue());
2115         }
2116       }
2117     }
2118
2119     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2120     Consts.push_back(Cst->getZExtValue());
2121     if (!TII->fitsConstReadLimitations(Consts)) {
2122       return false;
2123     }
2124
2125     Sel = CstOffset;
2126     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2127     return true;
2128   }
2129   case AMDGPU::MOV_IMM_I32:
2130   case AMDGPU::MOV_IMM_F32: {
2131     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2132     uint64_t ImmValue = 0;
2133
2134
2135     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2136       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2137       float FloatValue = FPC->getValueAPF().convertToFloat();
2138       if (FloatValue == 0.0) {
2139         ImmReg = AMDGPU::ZERO;
2140       } else if (FloatValue == 0.5) {
2141         ImmReg = AMDGPU::HALF;
2142       } else if (FloatValue == 1.0) {
2143         ImmReg = AMDGPU::ONE;
2144       } else {
2145         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2146       }
2147     } else {
2148       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2149       uint64_t Value = C->getZExtValue();
2150       if (Value == 0) {
2151         ImmReg = AMDGPU::ZERO;
2152       } else if (Value == 1) {
2153         ImmReg = AMDGPU::ONE_INT;
2154       } else {
2155         ImmValue = Value;
2156       }
2157     }
2158
2159     // Check that we aren't already using an immediate.
2160     // XXX: It's possible for an instruction to have more than one
2161     // immediate operand, but this is not supported yet.
2162     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2163       if (!Imm.getNode())
2164         return false;
2165       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2166       assert(C);
2167       if (C->getZExtValue())
2168         return false;
2169       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2170     }
2171     Src = DAG.getRegister(ImmReg, MVT::i32);
2172     return true;
2173   }
2174   default:
2175     return false;
2176   }
2177 }
2178
2179
2180 /// \brief Fold the instructions after selecting them
2181 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2182                                             SelectionDAG &DAG) const {
2183   const R600InstrInfo *TII =
2184       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2185   if (!Node->isMachineOpcode())
2186     return Node;
2187   unsigned Opcode = Node->getMachineOpcode();
2188   SDValue FakeOp;
2189
2190   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2191
2192   if (Opcode == AMDGPU::DOT_4) {
2193     int OperandIdx[] = {
2194       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2195       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2196       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2197       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2198       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2199       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2200       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2201       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2202         };
2203     int NegIdx[] = {
2204       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2205       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2208       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2209       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2211       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2212     };
2213     int AbsIdx[] = {
2214       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2215       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2216       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2217       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2218       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2219       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2220       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2221       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2222     };
2223     for (unsigned i = 0; i < 8; i++) {
2224       if (OperandIdx[i] < 0)
2225         return Node;
2226       SDValue &Src = Ops[OperandIdx[i] - 1];
2227       SDValue &Neg = Ops[NegIdx[i] - 1];
2228       SDValue &Abs = Ops[AbsIdx[i] - 1];
2229       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2230       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2231       if (HasDst)
2232         SelIdx--;
2233       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2234       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2235         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2236     }
2237   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2238     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2239       SDValue &Src = Ops[i];
2240       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2241         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2242     }
2243   } else if (Opcode == AMDGPU::CLAMP_R600) {
2244     SDValue Src = Node->getOperand(0);
2245     if (!Src.isMachineOpcode() ||
2246         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2247       return Node;
2248     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2249         AMDGPU::OpName::clamp);
2250     if (ClampIdx < 0)
2251       return Node;
2252     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2253     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2254     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2255         Node->getVTList(), Ops);
2256   } else {
2257     if (!TII->hasInstrModifiers(Opcode))
2258       return Node;
2259     int OperandIdx[] = {
2260       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2261       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2262       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2263     };
2264     int NegIdx[] = {
2265       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2266       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2267       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2268     };
2269     int AbsIdx[] = {
2270       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2271       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2272       -1
2273     };
2274     for (unsigned i = 0; i < 3; i++) {
2275       if (OperandIdx[i] < 0)
2276         return Node;
2277       SDValue &Src = Ops[OperandIdx[i] - 1];
2278       SDValue &Neg = Ops[NegIdx[i] - 1];
2279       SDValue FakeAbs;
2280       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2281       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2282       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2283       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2284       if (HasDst) {
2285         SelIdx--;
2286         ImmIdx--;
2287       }
2288       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2289       SDValue &Imm = Ops[ImmIdx];
2290       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2291         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2292     }
2293   }
2294
2295   return Node;
2296 }