lib/Target/R600/R600ISelLowering.cpp

   1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 /// \file
  11 /// \brief Custom DAG lowering for R600
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "R600ISelLowering.h"
  16 #include "AMDGPUFrameLowering.h"
  17 #include "AMDGPUIntrinsicInfo.h"
  18 #include "AMDGPUSubtarget.h"
  19 #include "R600Defines.h"
  20 #include "R600InstrInfo.h"
  21 #include "R600MachineFunctionInfo.h"
  22 #include "llvm/Analysis/ValueTracking.h"
  23 #include "llvm/CodeGen/CallingConvLower.h"
  24 #include "llvm/CodeGen/MachineFrameInfo.h"
  25 #include "llvm/CodeGen/MachineInstrBuilder.h"
  26 #include "llvm/CodeGen/MachineRegisterInfo.h"
  27 #include "llvm/CodeGen/SelectionDAG.h"
  28 #include "llvm/IR/Argument.h"
  29 #include "llvm/IR/Function.h"
  30
  31 using namespace llvm;
  32
  33 R600TargetLowering::R600TargetLowering(TargetMachine &TM,
  34                                        const AMDGPUSubtarget &STI)
  35     : AMDGPUTargetLowering(TM, STI), Gen(STI.getGeneration()) {
  36   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
  37   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
  38   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
  39   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
  40   addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
  41   addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
  42
  43   computeRegisterProperties(STI.getRegisterInfo());
  44
  45   // Set condition code actions
  46   setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
  47   setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
  48   setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
  49   setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
  50   setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
  51   setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  52   setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  53   setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  54   setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
  55   setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  56   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  57   setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
  58
  59   setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
  60   setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
  61   setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
  62   setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
  63
  64   setOperationAction(ISD::FCOS, MVT::f32, Custom);
  65   setOperationAction(ISD::FSIN, MVT::f32, Custom);
  66
  67   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
  68   setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
  69
  70   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  71   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  72   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  73
  74   setOperationAction(ISD::FSUB, MVT::f32, Expand);
  75
  76   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  77   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  78   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
  79
  80   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  81   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  82
  83   setOperationAction(ISD::SETCC, MVT::i32, Expand);
  84   setOperationAction(ISD::SETCC, MVT::f32, Expand);
  85   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
  86   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  87   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  88
  89   setOperationAction(ISD::SELECT, MVT::i32, Expand);
  90   setOperationAction(ISD::SELECT, MVT::f32, Expand);
  91   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  92   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
  93
  94   // Expand sign extension of vectors
  95   if (!Subtarget->hasBFE())
  96     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  97
  98   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
  99   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
 100
 101   if (!Subtarget->hasBFE())
 102     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
 103   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
 104   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
 105
 106   if (!Subtarget->hasBFE())
 107     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
 108   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
 109   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
 110
 111   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 112   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
 113   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
 114
 115   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
 116
 117
 118   // Legalize loads and stores to the private address space.
 119   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 120   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
 121   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 122
 123   // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
 124   // spaces, so it is custom lowered to handle those where it isn't.
 125   for (MVT VT : MVT::integer_valuetypes()) {
 126     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 127     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
 128     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
 129
 130     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
 131     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
 132     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
 133
 134     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
 135     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
 136     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
 137   }
 138
 139   setOperationAction(ISD::STORE, MVT::i8, Custom);
 140   setOperationAction(ISD::STORE, MVT::i32, Custom);
 141   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
 142   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 143   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
 144   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 145
 146   setOperationAction(ISD::LOAD, MVT::i32, Custom);
 147   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
 148   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 149
 150   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
 151   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
 152   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
 153   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 154
 155   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
 156   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
 157   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
 158   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
 159
 160   setTargetDAGCombine(ISD::FP_ROUND);
 161   setTargetDAGCombine(ISD::FP_TO_SINT);
 162   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 163   setTargetDAGCombine(ISD::SELECT_CC);
 164   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 165
 166   setOperationAction(ISD::SUB, MVT::i64, Expand);
 167
 168   // These should be replaced by UDVIREM, but it does not happen automatically
 169   // during Type Legalization
 170   setOperationAction(ISD::UDIV, MVT::i64, Custom);
 171   setOperationAction(ISD::UREM, MVT::i64, Custom);
 172   setOperationAction(ISD::SDIV, MVT::i64, Custom);
 173   setOperationAction(ISD::SREM, MVT::i64, Custom);
 174
 175   // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
 176   //  to be Legal/Custom in order to avoid library calls.
 177   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
 178   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
 179   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
 180
 181   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 182
 183   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
 184   for (MVT VT : ScalarIntVTs) {
 185     setOperationAction(ISD::ADDC, VT, Expand);
 186     setOperationAction(ISD::SUBC, VT, Expand);
 187     setOperationAction(ISD::ADDE, VT, Expand);
 188     setOperationAction(ISD::SUBE, VT, Expand);
 189   }
 190
 191   setSchedulingPreference(Sched::Source);
 192 }
 193
 194 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 195     MachineInstr * MI, MachineBasicBlock * BB) const {
 196   MachineFunction * MF = BB->getParent();
 197   MachineRegisterInfo &MRI = MF->getRegInfo();
 198   MachineBasicBlock::iterator I = *MI;
 199   const R600InstrInfo *TII =
 200       static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 201
 202   switch (MI->getOpcode()) {
 203   default:
 204     // Replace LDS_*_RET instruction that don't have any uses with the
 205     // equivalent LDS_*_NORET instruction.
 206     if (TII->isLDSRetInstr(MI->getOpcode())) {
 207       int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
 208       assert(DstIdx != -1);
 209       MachineInstrBuilder NewMI;
 210       // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
 211       //        LDS_1A2D support and remove this special case.
 212       if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
 213            MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
 214         return BB;
 215
 216       NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
 217                       TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
 218       for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
 219         NewMI.addOperand(MI->getOperand(i));
 220       }
 221     } else {
 222       return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
 223     }
 224     break;
 225   case AMDGPU::CLAMP_R600: {
 226     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 227                                                    AMDGPU::MOV,
 228                                                    MI->getOperand(0).getReg(),
 229                                                    MI->getOperand(1).getReg());
 230     TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
 231     break;
 232   }
 233
 234   case AMDGPU::FABS_R600: {
 235     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 236                                                     AMDGPU::MOV,
 237                                                     MI->getOperand(0).getReg(),
 238                                                     MI->getOperand(1).getReg());
 239     TII->addFlag(NewMI, 0, MO_FLAG_ABS);
 240     break;
 241   }
 242
 243   case AMDGPU::FNEG_R600: {
 244     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
 245                                                     AMDGPU::MOV,
 246                                                     MI->getOperand(0).getReg(),
 247                                                     MI->getOperand(1).getReg());
 248     TII->addFlag(NewMI, 0, MO_FLAG_NEG);
 249     break;
 250   }
 251
 252   case AMDGPU::MASK_WRITE: {
 253     unsigned maskedRegister = MI->getOperand(0).getReg();
 254     assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
 255     MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
 256     TII->addFlag(defInstr, 0, MO_FLAG_MASK);
 257     break;
 258   }
 259
 260   case AMDGPU::MOV_IMM_F32:
 261     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 262                      MI->getOperand(1).getFPImm()->getValueAPF()
 263                          .bitcastToAPInt().getZExtValue());
 264     break;
 265   case AMDGPU::MOV_IMM_I32:
 266     TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
 267                      MI->getOperand(1).getImm());
 268     break;
 269   case AMDGPU::CONST_COPY: {
 270     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
 271         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
 272     TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
 273         MI->getOperand(1).getImm());
 274     break;
 275   }
 276
 277   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
 278   case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
 279   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
 280     unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 281
 282     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 283             .addOperand(MI->getOperand(0))
 284             .addOperand(MI->getOperand(1))
 285             .addImm(EOP); // Set End of program bit
 286     break;
 287   }
 288
 289   case AMDGPU::TXD: {
 290     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 291     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 292     MachineOperand &RID = MI->getOperand(4);
 293     MachineOperand &SID = MI->getOperand(5);
 294     unsigned TextureId = MI->getOperand(6).getImm();
 295     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 296     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 297
 298     switch (TextureId) {
 299     case 5: // Rect
 300       CTX = CTY = 0;
 301       break;
 302     case 6: // Shadow1D
 303       SrcW = SrcZ;
 304       break;
 305     case 7: // Shadow2D
 306       SrcW = SrcZ;
 307       break;
 308     case 8: // ShadowRect
 309       CTX = CTY = 0;
 310       SrcW = SrcZ;
 311       break;
 312     case 9: // 1DArray
 313       SrcZ = SrcY;
 314       CTZ = 0;
 315       break;
 316     case 10: // 2DArray
 317       CTZ = 0;
 318       break;
 319     case 11: // Shadow1DArray
 320       SrcZ = SrcY;
 321       CTZ = 0;
 322       break;
 323     case 12: // Shadow2DArray
 324       CTZ = 0;
 325       break;
 326     }
 327     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 328             .addOperand(MI->getOperand(3))
 329             .addImm(SrcX)
 330             .addImm(SrcY)
 331             .addImm(SrcZ)
 332             .addImm(SrcW)
 333             .addImm(0)
 334             .addImm(0)
 335             .addImm(0)
 336             .addImm(0)
 337             .addImm(1)
 338             .addImm(2)
 339             .addImm(3)
 340             .addOperand(RID)
 341             .addOperand(SID)
 342             .addImm(CTX)
 343             .addImm(CTY)
 344             .addImm(CTZ)
 345             .addImm(CTW);
 346     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 347             .addOperand(MI->getOperand(2))
 348             .addImm(SrcX)
 349             .addImm(SrcY)
 350             .addImm(SrcZ)
 351             .addImm(SrcW)
 352             .addImm(0)
 353             .addImm(0)
 354             .addImm(0)
 355             .addImm(0)
 356             .addImm(1)
 357             .addImm(2)
 358             .addImm(3)
 359             .addOperand(RID)
 360             .addOperand(SID)
 361             .addImm(CTX)
 362             .addImm(CTY)
 363             .addImm(CTZ)
 364             .addImm(CTW);
 365     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
 366             .addOperand(MI->getOperand(0))
 367             .addOperand(MI->getOperand(1))
 368             .addImm(SrcX)
 369             .addImm(SrcY)
 370             .addImm(SrcZ)
 371             .addImm(SrcW)
 372             .addImm(0)
 373             .addImm(0)
 374             .addImm(0)
 375             .addImm(0)
 376             .addImm(1)
 377             .addImm(2)
 378             .addImm(3)
 379             .addOperand(RID)
 380             .addOperand(SID)
 381             .addImm(CTX)
 382             .addImm(CTY)
 383             .addImm(CTZ)
 384             .addImm(CTW)
 385             .addReg(T0, RegState::Implicit)
 386             .addReg(T1, RegState::Implicit);
 387     break;
 388   }
 389
 390   case AMDGPU::TXD_SHADOW: {
 391     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 392     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
 393     MachineOperand &RID = MI->getOperand(4);
 394     MachineOperand &SID = MI->getOperand(5);
 395     unsigned TextureId = MI->getOperand(6).getImm();
 396     unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
 397     unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
 398
 399     switch (TextureId) {
 400     case 5: // Rect
 401       CTX = CTY = 0;
 402       break;
 403     case 6: // Shadow1D
 404       SrcW = SrcZ;
 405       break;
 406     case 7: // Shadow2D
 407       SrcW = SrcZ;
 408       break;
 409     case 8: // ShadowRect
 410       CTX = CTY = 0;
 411       SrcW = SrcZ;
 412       break;
 413     case 9: // 1DArray
 414       SrcZ = SrcY;
 415       CTZ = 0;
 416       break;
 417     case 10: // 2DArray
 418       CTZ = 0;
 419       break;
 420     case 11: // Shadow1DArray
 421       SrcZ = SrcY;
 422       CTZ = 0;
 423       break;
 424     case 12: // Shadow2DArray
 425       CTZ = 0;
 426       break;
 427     }
 428
 429     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
 430             .addOperand(MI->getOperand(3))
 431             .addImm(SrcX)
 432             .addImm(SrcY)
 433             .addImm(SrcZ)
 434             .addImm(SrcW)
 435             .addImm(0)
 436             .addImm(0)
 437             .addImm(0)
 438             .addImm(0)
 439             .addImm(1)
 440             .addImm(2)
 441             .addImm(3)
 442             .addOperand(RID)
 443             .addOperand(SID)
 444             .addImm(CTX)
 445             .addImm(CTY)
 446             .addImm(CTZ)
 447             .addImm(CTW);
 448     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
 449             .addOperand(MI->getOperand(2))
 450             .addImm(SrcX)
 451             .addImm(SrcY)
 452             .addImm(SrcZ)
 453             .addImm(SrcW)
 454             .addImm(0)
 455             .addImm(0)
 456             .addImm(0)
 457             .addImm(0)
 458             .addImm(1)
 459             .addImm(2)
 460             .addImm(3)
 461             .addOperand(RID)
 462             .addOperand(SID)
 463             .addImm(CTX)
 464             .addImm(CTY)
 465             .addImm(CTZ)
 466             .addImm(CTW);
 467     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
 468             .addOperand(MI->getOperand(0))
 469             .addOperand(MI->getOperand(1))
 470             .addImm(SrcX)
 471             .addImm(SrcY)
 472             .addImm(SrcZ)
 473             .addImm(SrcW)
 474             .addImm(0)
 475             .addImm(0)
 476             .addImm(0)
 477             .addImm(0)
 478             .addImm(1)
 479             .addImm(2)
 480             .addImm(3)
 481             .addOperand(RID)
 482             .addOperand(SID)
 483             .addImm(CTX)
 484             .addImm(CTY)
 485             .addImm(CTZ)
 486             .addImm(CTW)
 487             .addReg(T0, RegState::Implicit)
 488             .addReg(T1, RegState::Implicit);
 489     break;
 490   }
 491
 492   case AMDGPU::BRANCH:
 493       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
 494               .addOperand(MI->getOperand(0));
 495       break;
 496
 497   case AMDGPU::BRANCH_COND_f32: {
 498     MachineInstr *NewMI =
 499       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 500               AMDGPU::PREDICATE_BIT)
 501               .addOperand(MI->getOperand(1))
 502               .addImm(OPCODE_IS_NOT_ZERO)
 503               .addImm(0); // Flags
 504     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 505     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 506             .addOperand(MI->getOperand(0))
 507             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 508     break;
 509   }
 510
 511   case AMDGPU::BRANCH_COND_i32: {
 512     MachineInstr *NewMI =
 513       BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
 514             AMDGPU::PREDICATE_BIT)
 515             .addOperand(MI->getOperand(1))
 516             .addImm(OPCODE_IS_NOT_ZERO_INT)
 517             .addImm(0); // Flags
 518     TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
 519     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
 520            .addOperand(MI->getOperand(0))
 521             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
 522     break;
 523   }
 524
 525   case AMDGPU::EG_ExportSwz:
 526   case AMDGPU::R600_ExportSwz: {
 527     // Instruction is left unmodified if its not the last one of its type
 528     bool isLastInstructionOfItsType = true;
 529     unsigned InstExportType = MI->getOperand(1).getImm();
 530     for (MachineBasicBlock::iterator NextExportInst = std::next(I),
 531          EndBlock = BB->end(); NextExportInst != EndBlock;
 532          NextExportInst = std::next(NextExportInst)) {
 533       if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
 534           NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
 535         unsigned CurrentInstExportType = NextExportInst->getOperand(1)
 536             .getImm();
 537         if (CurrentInstExportType == InstExportType) {
 538           isLastInstructionOfItsType = false;
 539           break;
 540         }
 541       }
 542     }
 543     bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 544     if (!EOP && !isLastInstructionOfItsType)
 545       return BB;
 546     unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
 547     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
 548             .addOperand(MI->getOperand(0))
 549             .addOperand(MI->getOperand(1))
 550             .addOperand(MI->getOperand(2))
 551             .addOperand(MI->getOperand(3))
 552             .addOperand(MI->getOperand(4))
 553             .addOperand(MI->getOperand(5))
 554             .addOperand(MI->getOperand(6))
 555             .addImm(CfInst)
 556             .addImm(EOP);
 557     break;
 558   }
 559   case AMDGPU::RETURN: {
 560     // RETURN instructions must have the live-out registers as implicit uses,
 561     // otherwise they appear dead.
 562     R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
 563     MachineInstrBuilder MIB(*MF, MI);
 564     for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
 565       MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
 566     return BB;
 567   }
 568   }
 569
 570   MI->eraseFromParent();
 571   return BB;
 572 }
 573
 574 //===----------------------------------------------------------------------===//
 575 // Custom DAG Lowering Operations
 576 //===----------------------------------------------------------------------===//
 577
 578 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 579   MachineFunction &MF = DAG.getMachineFunction();
 580   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 581   switch (Op.getOpcode()) {
 582   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 583   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
 584   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
 585   case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
 586   case ISD::SRA_PARTS:
 587   case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
 588   case ISD::FCOS:
 589   case ISD::FSIN: return LowerTrig(Op, DAG);
 590   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
 591   case ISD::STORE: return LowerSTORE(Op, DAG);
 592   case ISD::LOAD: {
 593     SDValue Result = LowerLOAD(Op, DAG);
 594     assert((!Result.getNode() ||
 595             Result.getNode()->getNumValues() == 2) &&
 596            "Load should return a value and a chain");
 597     return Result;
 598   }
 599
 600   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
 601   case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
 602   case ISD::INTRINSIC_VOID: {
 603     SDValue Chain = Op.getOperand(0);
 604     unsigned IntrinsicID =
 605                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 606     switch (IntrinsicID) {
 607     case AMDGPUIntrinsic::AMDGPU_store_output: {
 608       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
 609       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 610       MFI->LiveOuts.push_back(Reg);
 611       return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
 612     }
 613     case AMDGPUIntrinsic::R600_store_swizzle: {
 614       const SDValue Args[8] = {
 615         Chain,
 616         Op.getOperand(2), // Export Value
 617         Op.getOperand(3), // ArrayBase
 618         Op.getOperand(4), // Type
 619         DAG.getConstant(0, MVT::i32), // SWZ_X
 620         DAG.getConstant(1, MVT::i32), // SWZ_Y
 621         DAG.getConstant(2, MVT::i32), // SWZ_Z
 622         DAG.getConstant(3, MVT::i32) // SWZ_W
 623       };
 624       return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
 625     }
 626
 627     // default for switch(IntrinsicID)
 628     default: break;
 629     }
 630     // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
 631     break;
 632   }
 633   case ISD::INTRINSIC_WO_CHAIN: {
 634     unsigned IntrinsicID =
 635                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 636     EVT VT = Op.getValueType();
 637     SDLoc DL(Op);
 638     switch(IntrinsicID) {
 639     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
 640     case AMDGPUIntrinsic::R600_load_input: {
 641       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 642       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
 643       MachineFunction &MF = DAG.getMachineFunction();
 644       MachineRegisterInfo &MRI = MF.getRegInfo();
 645       MRI.addLiveIn(Reg);
 646       return DAG.getCopyFromReg(DAG.getEntryNode(),
 647           SDLoc(DAG.getEntryNode()), Reg, VT);
 648     }
 649
 650     case AMDGPUIntrinsic::R600_interp_input: {
 651       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 652       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
 653       MachineSDNode *interp;
 654       if (ijb < 0) {
 655         const R600InstrInfo *TII =
 656             static_cast<const R600InstrInfo *>(Subtarget->getInstrInfo());
 657         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
 658             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
 659         return DAG.getTargetExtractSubreg(
 660             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
 661             DL, MVT::f32, SDValue(interp, 0));
 662       }
 663       MachineFunction &MF = DAG.getMachineFunction();
 664       MachineRegisterInfo &MRI = MF.getRegInfo();
 665       unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
 666       unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
 667       MRI.addLiveIn(RegisterI);
 668       MRI.addLiveIn(RegisterJ);
 669       SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
 670           SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
 671       SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
 672           SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 673
 674       if (slot % 4 < 2)
 675         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 676             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 677             RegisterJNode, RegisterINode);
 678       else
 679         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 680             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
 681             RegisterJNode, RegisterINode);
 682       return SDValue(interp, slot % 2);
 683     }
 684     case AMDGPUIntrinsic::R600_interp_xy:
 685     case AMDGPUIntrinsic::R600_interp_zw: {
 686       int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 687       MachineSDNode *interp;
 688       SDValue RegisterINode = Op.getOperand(2);
 689       SDValue RegisterJNode = Op.getOperand(3);
 690
 691       if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
 692         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
 693             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 694             RegisterJNode, RegisterINode);
 695       else
 696         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
 697             MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
 698             RegisterJNode, RegisterINode);
 699       return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
 700           SDValue(interp, 0), SDValue(interp, 1));
 701     }
 702     case AMDGPUIntrinsic::R600_tex:
 703     case AMDGPUIntrinsic::R600_texc:
 704     case AMDGPUIntrinsic::R600_txl:
 705     case AMDGPUIntrinsic::R600_txlc:
 706     case AMDGPUIntrinsic::R600_txb:
 707     case AMDGPUIntrinsic::R600_txbc:
 708     case AMDGPUIntrinsic::R600_txf:
 709     case AMDGPUIntrinsic::R600_txq:
 710     case AMDGPUIntrinsic::R600_ddx:
 711     case AMDGPUIntrinsic::R600_ddy:
 712     case AMDGPUIntrinsic::R600_ldptr: {
 713       unsigned TextureOp;
 714       switch (IntrinsicID) {
 715       case AMDGPUIntrinsic::R600_tex:
 716         TextureOp = 0;
 717         break;
 718       case AMDGPUIntrinsic::R600_texc:
 719         TextureOp = 1;
 720         break;
 721       case AMDGPUIntrinsic::R600_txl:
 722         TextureOp = 2;
 723         break;
 724       case AMDGPUIntrinsic::R600_txlc:
 725         TextureOp = 3;
 726         break;
 727       case AMDGPUIntrinsic::R600_txb:
 728         TextureOp = 4;
 729         break;
 730       case AMDGPUIntrinsic::R600_txbc:
 731         TextureOp = 5;
 732         break;
 733       case AMDGPUIntrinsic::R600_txf:
 734         TextureOp = 6;
 735         break;
 736       case AMDGPUIntrinsic::R600_txq:
 737         TextureOp = 7;
 738         break;
 739       case AMDGPUIntrinsic::R600_ddx:
 740         TextureOp = 8;
 741         break;
 742       case AMDGPUIntrinsic::R600_ddy:
 743         TextureOp = 9;
 744         break;
 745       case AMDGPUIntrinsic::R600_ldptr:
 746         TextureOp = 10;
 747         break;
 748       default:
 749         llvm_unreachable("Unknow Texture Operation");
 750       }
 751
 752       SDValue TexArgs[19] = {
 753         DAG.getConstant(TextureOp, MVT::i32),
 754         Op.getOperand(1),
 755         DAG.getConstant(0, MVT::i32),
 756         DAG.getConstant(1, MVT::i32),
 757         DAG.getConstant(2, MVT::i32),
 758         DAG.getConstant(3, MVT::i32),
 759         Op.getOperand(2),
 760         Op.getOperand(3),
 761         Op.getOperand(4),
 762         DAG.getConstant(0, MVT::i32),
 763         DAG.getConstant(1, MVT::i32),
 764         DAG.getConstant(2, MVT::i32),
 765         DAG.getConstant(3, MVT::i32),
 766         Op.getOperand(5),
 767         Op.getOperand(6),
 768         Op.getOperand(7),
 769         Op.getOperand(8),
 770         Op.getOperand(9),
 771         Op.getOperand(10)
 772       };
 773       return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
 774     }
 775     case AMDGPUIntrinsic::AMDGPU_dp4: {
 776       SDValue Args[8] = {
 777       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 778           DAG.getConstant(0, MVT::i32)),
 779       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 780           DAG.getConstant(0, MVT::i32)),
 781       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 782           DAG.getConstant(1, MVT::i32)),
 783       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 784           DAG.getConstant(1, MVT::i32)),
 785       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 786           DAG.getConstant(2, MVT::i32)),
 787       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 788           DAG.getConstant(2, MVT::i32)),
 789       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
 790           DAG.getConstant(3, MVT::i32)),
 791       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
 792           DAG.getConstant(3, MVT::i32))
 793       };
 794       return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
 795     }
 796
 797     case Intrinsic::r600_read_ngroups_x:
 798       return LowerImplicitParameter(DAG, VT, DL, 0);
 799     case Intrinsic::r600_read_ngroups_y:
 800       return LowerImplicitParameter(DAG, VT, DL, 1);
 801     case Intrinsic::r600_read_ngroups_z:
 802       return LowerImplicitParameter(DAG, VT, DL, 2);
 803     case Intrinsic::r600_read_global_size_x:
 804       return LowerImplicitParameter(DAG, VT, DL, 3);
 805     case Intrinsic::r600_read_global_size_y:
 806       return LowerImplicitParameter(DAG, VT, DL, 4);
 807     case Intrinsic::r600_read_global_size_z:
 808       return LowerImplicitParameter(DAG, VT, DL, 5);
 809     case Intrinsic::r600_read_local_size_x:
 810       return LowerImplicitParameter(DAG, VT, DL, 6);
 811     case Intrinsic::r600_read_local_size_y:
 812       return LowerImplicitParameter(DAG, VT, DL, 7);
 813     case Intrinsic::r600_read_local_size_z:
 814       return LowerImplicitParameter(DAG, VT, DL, 8);
 815
 816     case Intrinsic::AMDGPU_read_workdim:
 817       return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
 818
 819     case Intrinsic::r600_read_tgid_x:
 820       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 821                                   AMDGPU::T1_X, VT);
 822     case Intrinsic::r600_read_tgid_y:
 823       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 824                                   AMDGPU::T1_Y, VT);
 825     case Intrinsic::r600_read_tgid_z:
 826       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 827                                   AMDGPU::T1_Z, VT);
 828     case Intrinsic::r600_read_tidig_x:
 829       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 830                                   AMDGPU::T0_X, VT);
 831     case Intrinsic::r600_read_tidig_y:
 832       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 833                                   AMDGPU::T0_Y, VT);
 834     case Intrinsic::r600_read_tidig_z:
 835       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
 836                                   AMDGPU::T0_Z, VT);
 837     case Intrinsic::AMDGPU_rsq:
 838       // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
 839       return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
 840     }
 841     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
 842     break;
 843   }
 844   } // end switch(Op.getOpcode())
 845   return SDValue();
 846 }
 847
 848 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
 849                                             SmallVectorImpl<SDValue> &Results,
 850                                             SelectionDAG &DAG) const {
 851   switch (N->getOpcode()) {
 852   default:
 853     AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
 854     return;
 855   case ISD::FP_TO_UINT:
 856     if (N->getValueType(0) == MVT::i1) {
 857       Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
 858       return;
 859     }
 860     // Fall-through. Since we don't care about out of bounds values
 861     // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
 862     // considers some extra cases which are not necessary here.
 863   case ISD::FP_TO_SINT: {
 864     SDValue Result;
 865     if (expandFP_TO_SINT(N, Result, DAG))
 866       Results.push_back(Result);
 867     return;
 868   }
 869   case ISD::UDIV: {
 870     SDValue Op = SDValue(N, 0);
 871     SDLoc DL(Op);
 872     EVT VT = Op.getValueType();
 873     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 874       N->getOperand(0), N->getOperand(1));
 875     Results.push_back(UDIVREM);
 876     break;
 877   }
 878   case ISD::UREM: {
 879     SDValue Op = SDValue(N, 0);
 880     SDLoc DL(Op);
 881     EVT VT = Op.getValueType();
 882     SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
 883       N->getOperand(0), N->getOperand(1));
 884     Results.push_back(UDIVREM.getValue(1));
 885     break;
 886   }
 887   case ISD::SDIV: {
 888     SDValue Op = SDValue(N, 0);
 889     SDLoc DL(Op);
 890     EVT VT = Op.getValueType();
 891     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 892       N->getOperand(0), N->getOperand(1));
 893     Results.push_back(SDIVREM);
 894     break;
 895   }
 896   case ISD::SREM: {
 897     SDValue Op = SDValue(N, 0);
 898     SDLoc DL(Op);
 899     EVT VT = Op.getValueType();
 900     SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
 901       N->getOperand(0), N->getOperand(1));
 902     Results.push_back(SDIVREM.getValue(1));
 903     break;
 904   }
 905   case ISD::SDIVREM: {
 906     SDValue Op = SDValue(N, 1);
 907     SDValue RES = LowerSDIVREM(Op, DAG);
 908     Results.push_back(RES);
 909     Results.push_back(RES.getValue(1));
 910     break;
 911   }
 912   case ISD::UDIVREM: {
 913     SDValue Op = SDValue(N, 0);
 914     LowerUDIVREM64(Op, DAG, Results);
 915     break;
 916   }
 917   }
 918 }
 919
 920 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
 921                                                    SDValue Vector) const {
 922
 923   SDLoc DL(Vector);
 924   EVT VecVT = Vector.getValueType();
 925   EVT EltVT = VecVT.getVectorElementType();
 926   SmallVector<SDValue, 8> Args;
 927
 928   for (unsigned i = 0, e = VecVT.getVectorNumElements();
 929                                                            i != e; ++i) {
 930     Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
 931                                Vector, DAG.getConstant(i, getVectorIdxTy())));
 932   }
 933
 934   return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
 935 }
 936
 937 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 938                                                     SelectionDAG &DAG) const {
 939
 940   SDLoc DL(Op);
 941   SDValue Vector = Op.getOperand(0);
 942   SDValue Index = Op.getOperand(1);
 943
 944   if (isa<ConstantSDNode>(Index) ||
 945       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 946     return Op;
 947
 948   Vector = vectorToVerticalVector(DAG, Vector);
 949   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
 950                      Vector, Index);
 951 }
 952
 953 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
 954                                                    SelectionDAG &DAG) const {
 955   SDLoc DL(Op);
 956   SDValue Vector = Op.getOperand(0);
 957   SDValue Value = Op.getOperand(1);
 958   SDValue Index = Op.getOperand(2);
 959
 960   if (isa<ConstantSDNode>(Index) ||
 961       Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
 962     return Op;
 963
 964   Vector = vectorToVerticalVector(DAG, Vector);
 965   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
 966                                Vector, Value, Index);
 967   return vectorToVerticalVector(DAG, Insert);
 968 }
 969
 970 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
 971   // On hw >= R700, COS/SIN input must be between -1. and 1.
 972   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
 973   EVT VT = Op.getValueType();
 974   SDValue Arg = Op.getOperand(0);
 975   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
 976       DAG.getNode(ISD::FADD, SDLoc(Op), VT,
 977         DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
 978           DAG.getConstantFP(0.15915494309, MVT::f32)),
 979         DAG.getConstantFP(0.5, MVT::f32)));
 980   unsigned TrigNode;
 981   switch (Op.getOpcode()) {
 982   case ISD::FCOS:
 983     TrigNode = AMDGPUISD::COS_HW;
 984     break;
 985   case ISD::FSIN:
 986     TrigNode = AMDGPUISD::SIN_HW;
 987     break;
 988   default:
 989     llvm_unreachable("Wrong trig opcode");
 990   }
 991   SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
 992       DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
 993         DAG.getConstantFP(-0.5, MVT::f32)));
 994   if (Gen >= AMDGPUSubtarget::R700)
 995     return TrigVal;
 996   // On R600 hw, COS/SIN input must be between -Pi and Pi.
 997   return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
 998       DAG.getConstantFP(3.14159265359, MVT::f32));
 999 }
1000
1001 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1002   SDLoc DL(Op);
1003   EVT VT = Op.getValueType();
1004
1005   SDValue Lo = Op.getOperand(0);
1006   SDValue Hi = Op.getOperand(1);
1007   SDValue Shift = Op.getOperand(2);
1008   SDValue Zero = DAG.getConstant(0, VT);
1009   SDValue One  = DAG.getConstant(1, VT);
1010
1011   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1012   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1013   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1014   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1015
1016   // The dance around Width1 is necessary for 0 special case.
1017   // Without it the CompShift might be 32, producing incorrect results in
1018   // Overflow. So we do the shift in two steps, the alternative is to
1019   // add a conditional to filter the special case.
1020
1021   SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1022   Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1023
1024   SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1025   HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1026   SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1027
1028   SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1029   SDValue LoBig = Zero;
1030
1031   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1032   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1033
1034   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1035 }
1036
1037 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1038   SDLoc DL(Op);
1039   EVT VT = Op.getValueType();
1040
1041   SDValue Lo = Op.getOperand(0);
1042   SDValue Hi = Op.getOperand(1);
1043   SDValue Shift = Op.getOperand(2);
1044   SDValue Zero = DAG.getConstant(0, VT);
1045   SDValue One  = DAG.getConstant(1, VT);
1046
1047   const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1048
1049   SDValue Width  = DAG.getConstant(VT.getSizeInBits(), VT);
1050   SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1051   SDValue BigShift  = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1052   SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1053
1054   // The dance around Width1 is necessary for 0 special case.
1055   // Without it the CompShift might be 32, producing incorrect results in
1056   // Overflow. So we do the shift in two steps, the alternative is to
1057   // add a conditional to filter the special case.
1058
1059   SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1060   Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1061
1062   SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1063   SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1064   LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1065
1066   SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1067   SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1068
1069   Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1070   Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1071
1072   return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1073 }
1074
1075 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1076   return DAG.getNode(
1077       ISD::SETCC,
1078       SDLoc(Op),
1079       MVT::i1,
1080       Op, DAG.getConstantFP(0.0f, MVT::f32),
1081       DAG.getCondCode(ISD::SETNE)
1082       );
1083 }
1084
1085 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1086                                                    SDLoc DL,
1087                                                    unsigned DwordOffset) const {
1088   unsigned ByteOffset = DwordOffset * 4;
1089   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1090                                       AMDGPUAS::CONSTANT_BUFFER_0);
1091
1092   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
1093   assert(isInt<16>(ByteOffset));
1094
1095   return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1096                      DAG.getConstant(ByteOffset, MVT::i32), // PTR
1097                      MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1098                      false, false, false, 0);
1099 }
1100
1101 bool R600TargetLowering::isZero(SDValue Op) const {
1102   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1103     return Cst->isNullValue();
1104   } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1105     return CstFP->isZero();
1106   } else {
1107     return false;
1108   }
1109 }
1110
1111 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1112   SDLoc DL(Op);
1113   EVT VT = Op.getValueType();
1114
1115   SDValue LHS = Op.getOperand(0);
1116   SDValue RHS = Op.getOperand(1);
1117   SDValue True = Op.getOperand(2);
1118   SDValue False = Op.getOperand(3);
1119   SDValue CC = Op.getOperand(4);
1120   SDValue Temp;
1121
1122   if (VT == MVT::f32) {
1123     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1124     SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1125     if (MinMax)
1126       return MinMax;
1127   }
1128
1129   // LHS and RHS are guaranteed to be the same value type
1130   EVT CompareVT = LHS.getValueType();
1131
1132   // Check if we can lower this to a native operation.
1133
1134   // Try to lower to a SET* instruction:
1135   //
1136   // SET* can match the following patterns:
1137   //
1138   // select_cc f32, f32, -1,  0, cc_supported
1139   // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1140   // select_cc i32, i32, -1,  0, cc_supported
1141   //
1142
1143   // Move hardware True/False values to the correct operand.
1144   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1145   ISD::CondCode InverseCC =
1146      ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1147   if (isHWTrueValue(False) && isHWFalseValue(True)) {
1148     if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1149       std::swap(False, True);
1150       CC = DAG.getCondCode(InverseCC);
1151     } else {
1152       ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1153       if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1154         std::swap(False, True);
1155         std::swap(LHS, RHS);
1156         CC = DAG.getCondCode(SwapInvCC);
1157       }
1158     }
1159   }
1160
1161   if (isHWTrueValue(True) && isHWFalseValue(False) &&
1162       (CompareVT == VT || VT == MVT::i32)) {
1163     // This can be matched by a SET* instruction.
1164     return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1165   }
1166
1167   // Try to lower to a CND* instruction:
1168   //
1169   // CND* can match the following patterns:
1170   //
1171   // select_cc f32, 0.0, f32, f32, cc_supported
1172   // select_cc f32, 0.0, i32, i32, cc_supported
1173   // select_cc i32, 0,   f32, f32, cc_supported
1174   // select_cc i32, 0,   i32, i32, cc_supported
1175   //
1176
1177   // Try to move the zero value to the RHS
1178   if (isZero(LHS)) {
1179     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1180     // Try swapping the operands
1181     ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1182     if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1183       std::swap(LHS, RHS);
1184       CC = DAG.getCondCode(CCSwapped);
1185     } else {
1186       // Try inverting the conditon and then swapping the operands
1187       ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1188       CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1189       if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1190         std::swap(True, False);
1191         std::swap(LHS, RHS);
1192         CC = DAG.getCondCode(CCSwapped);
1193       }
1194     }
1195   }
1196   if (isZero(RHS)) {
1197     SDValue Cond = LHS;
1198     SDValue Zero = RHS;
1199     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1200     if (CompareVT != VT) {
1201       // Bitcast True / False to the correct types.  This will end up being
1202       // a nop, but it allows us to define only a single pattern in the
1203       // .TD files for each CND* instruction rather than having to have
1204       // one pattern for integer True/False and one for fp True/False
1205       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1206       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1207     }
1208
1209     switch (CCOpcode) {
1210     case ISD::SETONE:
1211     case ISD::SETUNE:
1212     case ISD::SETNE:
1213       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1214       Temp = True;
1215       True = False;
1216       False = Temp;
1217       break;
1218     default:
1219       break;
1220     }
1221     SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1222         Cond, Zero,
1223         True, False,
1224         DAG.getCondCode(CCOpcode));
1225     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1226   }
1227
1228   // If we make it this for it means we have no native instructions to handle
1229   // this SELECT_CC, so we must lower it.
1230   SDValue HWTrue, HWFalse;
1231
1232   if (CompareVT == MVT::f32) {
1233     HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1234     HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1235   } else if (CompareVT == MVT::i32) {
1236     HWTrue = DAG.getConstant(-1, CompareVT);
1237     HWFalse = DAG.getConstant(0, CompareVT);
1238   }
1239   else {
1240     llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1241   }
1242
1243   // Lower this unsupported SELECT_CC into a combination of two supported
1244   // SELECT_CC operations.
1245   SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1246
1247   return DAG.getNode(ISD::SELECT_CC, DL, VT,
1248       Cond, HWFalse,
1249       True, False,
1250       DAG.getCondCode(ISD::SETNE));
1251 }
1252
1253 /// LLVM generates byte-addressed pointers.  For indirect addressing, we need to
1254 /// convert these pointers to a register index.  Each register holds
1255 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
1256 /// \p StackWidth, which tells us how many of the 4 sub-registrers will be used
1257 /// for indirect addressing.
1258 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1259                                                unsigned StackWidth,
1260                                                SelectionDAG &DAG) const {
1261   unsigned SRLPad;
1262   switch(StackWidth) {
1263   case 1:
1264     SRLPad = 2;
1265     break;
1266   case 2:
1267     SRLPad = 3;
1268     break;
1269   case 4:
1270     SRLPad = 4;
1271     break;
1272   default: llvm_unreachable("Invalid stack width");
1273   }
1274
1275   return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1276                      DAG.getConstant(SRLPad, MVT::i32));
1277 }
1278
1279 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1280                                          unsigned ElemIdx,
1281                                          unsigned &Channel,
1282                                          unsigned &PtrIncr) const {
1283   switch (StackWidth) {
1284   default:
1285   case 1:
1286     Channel = 0;
1287     if (ElemIdx > 0) {
1288       PtrIncr = 1;
1289     } else {
1290       PtrIncr = 0;
1291     }
1292     break;
1293   case 2:
1294     Channel = ElemIdx % 2;
1295     if (ElemIdx == 2) {
1296       PtrIncr = 1;
1297     } else {
1298       PtrIncr = 0;
1299     }
1300     break;
1301   case 4:
1302     Channel = ElemIdx;
1303     PtrIncr = 0;
1304     break;
1305   }
1306 }
1307
1308 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1309   SDLoc DL(Op);
1310   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1311   SDValue Chain = Op.getOperand(0);
1312   SDValue Value = Op.getOperand(1);
1313   SDValue Ptr = Op.getOperand(2);
1314
1315   SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1316   if (Result.getNode()) {
1317     return Result;
1318   }
1319
1320   if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1321     if (StoreNode->isTruncatingStore()) {
1322       EVT VT = Value.getValueType();
1323       assert(VT.bitsLE(MVT::i32));
1324       EVT MemVT = StoreNode->getMemoryVT();
1325       SDValue MaskConstant;
1326       if (MemVT == MVT::i8) {
1327         MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1328       } else {
1329         assert(MemVT == MVT::i16);
1330         MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1331       }
1332       SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1333                                       DAG.getConstant(2, MVT::i32));
1334       SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1335                                       DAG.getConstant(0x00000003, VT));
1336       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1337       SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1338                                    DAG.getConstant(3, VT));
1339       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1340       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
1341       // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1342       // vector instead.
1343       SDValue Src[4] = {
1344         ShiftedValue,
1345         DAG.getConstant(0, MVT::i32),
1346         DAG.getConstant(0, MVT::i32),
1347         Mask
1348       };
1349       SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1350       SDValue Args[3] = { Chain, Input, DWordAddr };
1351       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1352                                      Op->getVTList(), Args, MemVT,
1353                                      StoreNode->getMemOperand());
1354     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1355                Value.getValueType().bitsGE(MVT::i32)) {
1356       // Convert pointer from byte address to dword address.
1357       Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1358                         DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1359                                     Ptr, DAG.getConstant(2, MVT::i32)));
1360
1361       if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1362         llvm_unreachable("Truncated and indexed stores not supported yet");
1363       } else {
1364         Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1365       }
1366       return Chain;
1367     }
1368   }
1369
1370   EVT ValueVT = Value.getValueType();
1371
1372   if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1373     return SDValue();
1374   }
1375
1376   SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1377   if (Ret.getNode()) {
1378     return Ret;
1379   }
1380   // Lowering for indirect addressing
1381
1382   const MachineFunction &MF = DAG.getMachineFunction();
1383   const AMDGPUFrameLowering *TFL =
1384       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1385   unsigned StackWidth = TFL->getStackWidth(MF);
1386
1387   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1388
1389   if (ValueVT.isVector()) {
1390     unsigned NumElemVT = ValueVT.getVectorNumElements();
1391     EVT ElemVT = ValueVT.getVectorElementType();
1392     SmallVector<SDValue, 4> Stores(NumElemVT);
1393
1394     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1395                                       "vector width in load");
1396
1397     for (unsigned i = 0; i < NumElemVT; ++i) {
1398       unsigned Channel, PtrIncr;
1399       getStackAddress(StackWidth, i, Channel, PtrIncr);
1400       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1401                         DAG.getConstant(PtrIncr, MVT::i32));
1402       SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1403                                  Value, DAG.getConstant(i, MVT::i32));
1404
1405       Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1406                               Chain, Elem, Ptr,
1407                               DAG.getTargetConstant(Channel, MVT::i32));
1408     }
1409      Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1410    } else {
1411     if (ValueVT == MVT::i8) {
1412       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1413     }
1414     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1415     DAG.getTargetConstant(0, MVT::i32)); // Channel
1416   }
1417
1418   return Chain;
1419 }
1420
1421 // return (512 + (kc_bank << 12)
1422 static int
1423 ConstantAddressBlock(unsigned AddressSpace) {
1424   switch (AddressSpace) {
1425   case AMDGPUAS::CONSTANT_BUFFER_0:
1426     return 512;
1427   case AMDGPUAS::CONSTANT_BUFFER_1:
1428     return 512 + 4096;
1429   case AMDGPUAS::CONSTANT_BUFFER_2:
1430     return 512 + 4096 * 2;
1431   case AMDGPUAS::CONSTANT_BUFFER_3:
1432     return 512 + 4096 * 3;
1433   case AMDGPUAS::CONSTANT_BUFFER_4:
1434     return 512 + 4096 * 4;
1435   case AMDGPUAS::CONSTANT_BUFFER_5:
1436     return 512 + 4096 * 5;
1437   case AMDGPUAS::CONSTANT_BUFFER_6:
1438     return 512 + 4096 * 6;
1439   case AMDGPUAS::CONSTANT_BUFFER_7:
1440     return 512 + 4096 * 7;
1441   case AMDGPUAS::CONSTANT_BUFFER_8:
1442     return 512 + 4096 * 8;
1443   case AMDGPUAS::CONSTANT_BUFFER_9:
1444     return 512 + 4096 * 9;
1445   case AMDGPUAS::CONSTANT_BUFFER_10:
1446     return 512 + 4096 * 10;
1447   case AMDGPUAS::CONSTANT_BUFFER_11:
1448     return 512 + 4096 * 11;
1449   case AMDGPUAS::CONSTANT_BUFFER_12:
1450     return 512 + 4096 * 12;
1451   case AMDGPUAS::CONSTANT_BUFFER_13:
1452     return 512 + 4096 * 13;
1453   case AMDGPUAS::CONSTANT_BUFFER_14:
1454     return 512 + 4096 * 14;
1455   case AMDGPUAS::CONSTANT_BUFFER_15:
1456     return 512 + 4096 * 15;
1457   default:
1458     return -1;
1459   }
1460 }
1461
1462 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1463 {
1464   EVT VT = Op.getValueType();
1465   SDLoc DL(Op);
1466   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1467   SDValue Chain = Op.getOperand(0);
1468   SDValue Ptr = Op.getOperand(1);
1469   SDValue LoweredLoad;
1470
1471   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1472   if (Ret.getNode()) {
1473     SDValue Ops[2] = {
1474       Ret,
1475       Chain
1476     };
1477     return DAG.getMergeValues(Ops, DL);
1478   }
1479
1480   // Lower loads of global variables in the constant address space
1481   if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1482       isa<GlobalVariable>(
1483           GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {
1484
1485     SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1486         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1487     Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1488         DAG.getConstant(2, MVT::i32));
1489     return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1490                        LoadNode->getChain(), Ptr,
1491                        DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
1492   }
1493
1494   if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1495     SDValue MergedValues[2] = {
1496       ScalarizeVectorLoad(Op, DAG),
1497       Chain
1498     };
1499     return DAG.getMergeValues(MergedValues, DL);
1500   }
1501
1502   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1503   if (ConstantBlock > -1 &&
1504       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1505        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1506     SDValue Result;
1507     if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1508         isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1509         isa<ConstantSDNode>(Ptr)) {
1510       SDValue Slots[4];
1511       for (unsigned i = 0; i < 4; i++) {
1512         // We want Const position encoded with the following formula :
1513         // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1514         // const_index is Ptr computed by llvm using an alignment of 16.
1515         // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
1516         // then div by 4 at the ISel step
1517         SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1518             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1519         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1520       }
1521       EVT NewVT = MVT::v4i32;
1522       unsigned NumElements = 4;
1523       if (VT.isVector()) {
1524         NewVT = VT;
1525         NumElements = VT.getVectorNumElements();
1526       }
1527       Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1528                            makeArrayRef(Slots, NumElements));
1529     } else {
1530       // non-constant ptr can't be folded, keeps it as a v4f32 load
1531       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1532           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1533           DAG.getConstant(LoadNode->getAddressSpace() -
1534                           AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1535           );
1536     }
1537
1538     if (!VT.isVector()) {
1539       Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1540           DAG.getConstant(0, MVT::i32));
1541     }
1542
1543     SDValue MergedValues[2] = {
1544       Result,
1545       Chain
1546     };
1547     return DAG.getMergeValues(MergedValues, DL);
1548   }
1549
1550   // For most operations returning SDValue() will result in the node being
1551   // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1552   // need to manually expand loads that may be legal in some address spaces and
1553   // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1554   // compute shaders, since the data is sign extended when it is uploaded to the
1555   // buffer. However SEXT loads from other address spaces are not supported, so
1556   // we need to expand them here.
1557   if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1558     EVT MemVT = LoadNode->getMemoryVT();
1559     assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1560     SDValue ShiftAmount =
1561           DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1562     SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1563                                   LoadNode->getPointerInfo(), MemVT,
1564                                   LoadNode->isVolatile(),
1565                                   LoadNode->isNonTemporal(),
1566                                   LoadNode->isInvariant(),
1567                                   LoadNode->getAlignment());
1568     SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1569     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1570
1571     SDValue MergedValues[2] = { Sra, Chain };
1572     return DAG.getMergeValues(MergedValues, DL);
1573   }
1574
1575   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1576     return SDValue();
1577   }
1578
1579   // Lowering for indirect addressing
1580   const MachineFunction &MF = DAG.getMachineFunction();
1581   const AMDGPUFrameLowering *TFL =
1582       static_cast<const AMDGPUFrameLowering *>(Subtarget->getFrameLowering());
1583   unsigned StackWidth = TFL->getStackWidth(MF);
1584
1585   Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1586
1587   if (VT.isVector()) {
1588     unsigned NumElemVT = VT.getVectorNumElements();
1589     EVT ElemVT = VT.getVectorElementType();
1590     SDValue Loads[4];
1591
1592     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1593                                       "vector width in load");
1594
1595     for (unsigned i = 0; i < NumElemVT; ++i) {
1596       unsigned Channel, PtrIncr;
1597       getStackAddress(StackWidth, i, Channel, PtrIncr);
1598       Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1599                         DAG.getConstant(PtrIncr, MVT::i32));
1600       Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1601                              Chain, Ptr,
1602                              DAG.getTargetConstant(Channel, MVT::i32),
1603                              Op.getOperand(2));
1604     }
1605     for (unsigned i = NumElemVT; i < 4; ++i) {
1606       Loads[i] = DAG.getUNDEF(ElemVT);
1607     }
1608     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1609     LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1610   } else {
1611     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1612                               Chain, Ptr,
1613                               DAG.getTargetConstant(0, MVT::i32), // Channel
1614                               Op.getOperand(2));
1615   }
1616
1617   SDValue Ops[2] = {
1618     LoweredLoad,
1619     Chain
1620   };
1621
1622   return DAG.getMergeValues(Ops, DL);
1623 }
1624
1625 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1626   SDValue Chain = Op.getOperand(0);
1627   SDValue Cond  = Op.getOperand(1);
1628   SDValue Jump  = Op.getOperand(2);
1629
1630   return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1631                      Chain, Jump, Cond);
1632 }
1633
1634 /// XXX Only kernel functions are supported, so we can assume for now that
1635 /// every function is a kernel function, but in the future we should use
1636 /// separate calling conventions for kernel and non-kernel functions.
1637 SDValue R600TargetLowering::LowerFormalArguments(
1638                                       SDValue Chain,
1639                                       CallingConv::ID CallConv,
1640                                       bool isVarArg,
1641                                       const SmallVectorImpl<ISD::InputArg> &Ins,
1642                                       SDLoc DL, SelectionDAG &DAG,
1643                                       SmallVectorImpl<SDValue> &InVals) const {
1644   SmallVector<CCValAssign, 16> ArgLocs;
1645   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1646                  *DAG.getContext());
1647   MachineFunction &MF = DAG.getMachineFunction();
1648   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1649
1650   SmallVector<ISD::InputArg, 8> LocalIns;
1651
1652   getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1653
1654   AnalyzeFormalArguments(CCInfo, LocalIns);
1655
1656   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1657     CCValAssign &VA = ArgLocs[i];
1658     const ISD::InputArg &In = Ins[i];
1659     EVT VT = In.VT;
1660     EVT MemVT = VA.getLocVT();
1661     if (!VT.isVector() && MemVT.isVector()) {
1662       // Get load source type if scalarized.
1663       MemVT = MemVT.getVectorElementType();
1664     }
1665
1666     if (MFI->getShaderType() != ShaderType::COMPUTE) {
1667       unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1668       SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1669       InVals.push_back(Register);
1670       continue;
1671     }
1672
1673     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1674                                           AMDGPUAS::CONSTANT_BUFFER_0);
1675
1676     // i64 isn't a legal type, so the register type used ends up as i32, which
1677     // isn't expected here. It attempts to create this sextload, but it ends up
1678     // being invalid. Somehow this seems to work with i64 arguments, but breaks
1679     // for <1 x i64>.
1680
1681     // The first 36 bytes of the input buffer contains information about
1682     // thread group and global sizes.
1683     ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1684     if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1685       // FIXME: This should really check the extload type, but the handling of
1686       // extload vector parameters seems to be broken.
1687
1688       // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1689       Ext = ISD::SEXTLOAD;
1690     }
1691
1692     // Compute the offset from the value.
1693     // XXX - I think PartOffset should give you this, but it seems to give the
1694     // size of the register which isn't useful.
1695
1696     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
1697     unsigned PartOffset = VA.getLocMemOffset();
1698     unsigned Offset = 36 + VA.getLocMemOffset();
1699
1700     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1701     SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1702                               DAG.getConstant(Offset, MVT::i32),
1703                               DAG.getUNDEF(MVT::i32),
1704                               PtrInfo,
1705                               MemVT, false, true, true, 4);
1706
1707     // 4 is the preferred alignment for the CONSTANT memory space.
1708     InVals.push_back(Arg);
1709     MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1710   }
1711   return Chain;
1712 }
1713
1714 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1715    if (!VT.isVector())
1716      return MVT::i32;
1717    return VT.changeVectorElementTypeToInteger();
1718 }
1719
1720 static SDValue CompactSwizzlableVector(
1721   SelectionDAG &DAG, SDValue VectorEntry,
1722   DenseMap<unsigned, unsigned> &RemapSwizzle) {
1723   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1724   assert(RemapSwizzle.empty());
1725   SDValue NewBldVec[4] = {
1726     VectorEntry.getOperand(0),
1727     VectorEntry.getOperand(1),
1728     VectorEntry.getOperand(2),
1729     VectorEntry.getOperand(3)
1730   };
1731
1732   for (unsigned i = 0; i < 4; i++) {
1733     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1734       // We mask write here to teach later passes that the ith element of this
1735       // vector is undef. Thus we can use it to reduce 128 bits reg usage,
1736       // break false dependencies and additionnaly make assembly easier to read.
1737       RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1738     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1739       if (C->isZero()) {
1740         RemapSwizzle[i] = 4; // SEL_0
1741         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1742       } else if (C->isExactlyValue(1.0)) {
1743         RemapSwizzle[i] = 5; // SEL_1
1744         NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1745       }
1746     }
1747
1748     if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1749       continue;
1750     for (unsigned j = 0; j < i; j++) {
1751       if (NewBldVec[i] == NewBldVec[j]) {
1752         NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1753         RemapSwizzle[i] = j;
1754         break;
1755       }
1756     }
1757   }
1758
1759   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1760                      VectorEntry.getValueType(), NewBldVec);
1761 }
1762
1763 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1764                                 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1765   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1766   assert(RemapSwizzle.empty());
1767   SDValue NewBldVec[4] = {
1768       VectorEntry.getOperand(0),
1769       VectorEntry.getOperand(1),
1770       VectorEntry.getOperand(2),
1771       VectorEntry.getOperand(3)
1772   };
1773   bool isUnmovable[4] = { false, false, false, false };
1774   for (unsigned i = 0; i < 4; i++) {
1775     RemapSwizzle[i] = i;
1776     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1777       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1778           ->getZExtValue();
1779       if (i == Idx)
1780         isUnmovable[Idx] = true;
1781     }
1782   }
1783
1784   for (unsigned i = 0; i < 4; i++) {
1785     if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1786       unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1787           ->getZExtValue();
1788       if (isUnmovable[Idx])
1789         continue;
1790       // Swap i and Idx
1791       std::swap(NewBldVec[Idx], NewBldVec[i]);
1792       std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1793       break;
1794     }
1795   }
1796
1797   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1798                      VectorEntry.getValueType(), NewBldVec);
1799 }
1800
1801
1802 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1803 SDValue Swz[4], SelectionDAG &DAG) const {
1804   assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1805   // Old -> New swizzle values
1806   DenseMap<unsigned, unsigned> SwizzleRemap;
1807
1808   BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1809   for (unsigned i = 0; i < 4; i++) {
1810     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1811     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1812       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1813   }
1814
1815   SwizzleRemap.clear();
1816   BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1817   for (unsigned i = 0; i < 4; i++) {
1818     unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1819     if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1820       Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1821   }
1822
1823   return BuildVector;
1824 }
1825
1826
1827 //===----------------------------------------------------------------------===//
1828 // Custom DAG Optimizations
1829 //===----------------------------------------------------------------------===//
1830
1831 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1832                                               DAGCombinerInfo &DCI) const {
1833   SelectionDAG &DAG = DCI.DAG;
1834
1835   switch (N->getOpcode()) {
1836   default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1837   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1838   case ISD::FP_ROUND: {
1839       SDValue Arg = N->getOperand(0);
1840       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1841         return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1842                            Arg.getOperand(0));
1843       }
1844       break;
1845     }
1846
1847   // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1848   // (i32 select_cc f32, f32, -1, 0 cc)
1849   //
1850   // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1851   // this to one of the SET*_DX10 instructions.
1852   case ISD::FP_TO_SINT: {
1853     SDValue FNeg = N->getOperand(0);
1854     if (FNeg.getOpcode() != ISD::FNEG) {
1855       return SDValue();
1856     }
1857     SDValue SelectCC = FNeg.getOperand(0);
1858     if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1859         SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1860         SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1861         !isHWTrueValue(SelectCC.getOperand(2)) ||
1862         !isHWFalseValue(SelectCC.getOperand(3))) {
1863       return SDValue();
1864     }
1865
1866     return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1867                            SelectCC.getOperand(0), // LHS
1868                            SelectCC.getOperand(1), // RHS
1869                            DAG.getConstant(-1, MVT::i32), // True
1870                            DAG.getConstant(0, MVT::i32),  // Flase
1871                            SelectCC.getOperand(4)); // CC
1872
1873     break;
1874   }
1875
1876   // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
1877   // => build_vector elt0, ... , NewEltIdx, ... , eltN
1878   case ISD::INSERT_VECTOR_ELT: {
1879     SDValue InVec = N->getOperand(0);
1880     SDValue InVal = N->getOperand(1);
1881     SDValue EltNo = N->getOperand(2);
1882     SDLoc dl(N);
1883
1884     // If the inserted element is an UNDEF, just use the input vector.
1885     if (InVal.getOpcode() == ISD::UNDEF)
1886       return InVec;
1887
1888     EVT VT = InVec.getValueType();
1889
1890     // If we can't generate a legal BUILD_VECTOR, exit
1891     if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1892       return SDValue();
1893
1894     // Check that we know which element is being inserted
1895     if (!isa<ConstantSDNode>(EltNo))
1896       return SDValue();
1897     unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1898
1899     // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1900     // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
1901     // vector elements.
1902     SmallVector<SDValue, 8> Ops;
1903     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1904       Ops.append(InVec.getNode()->op_begin(),
1905                  InVec.getNode()->op_end());
1906     } else if (InVec.getOpcode() == ISD::UNDEF) {
1907       unsigned NElts = VT.getVectorNumElements();
1908       Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1909     } else {
1910       return SDValue();
1911     }
1912
1913     // Insert the element
1914     if (Elt < Ops.size()) {
1915       // All the operands of BUILD_VECTOR must have the same type;
1916       // we enforce that here.
1917       EVT OpVT = Ops[0].getValueType();
1918       if (InVal.getValueType() != OpVT)
1919         InVal = OpVT.bitsGT(InVal.getValueType()) ?
1920           DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1921           DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1922       Ops[Elt] = InVal;
1923     }
1924
1925     // Return the new vector
1926     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1927   }
1928
1929   // Extract_vec (Build_vector) generated by custom lowering
1930   // also needs to be customly combined
1931   case ISD::EXTRACT_VECTOR_ELT: {
1932     SDValue Arg = N->getOperand(0);
1933     if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1934       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1935         unsigned Element = Const->getZExtValue();
1936         return Arg->getOperand(Element);
1937       }
1938     }
1939     if (Arg.getOpcode() == ISD::BITCAST &&
1940         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1941       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1942         unsigned Element = Const->getZExtValue();
1943         return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1944             Arg->getOperand(0).getOperand(Element));
1945       }
1946     }
1947   }
1948
1949   case ISD::SELECT_CC: {
1950     // Try common optimizations
1951     SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1952     if (Ret.getNode())
1953       return Ret;
1954
1955     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1956     //      selectcc x, y, a, b, inv(cc)
1957     //
1958     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1959     //      selectcc x, y, a, b, cc
1960     SDValue LHS = N->getOperand(0);
1961     if (LHS.getOpcode() != ISD::SELECT_CC) {
1962       return SDValue();
1963     }
1964
1965     SDValue RHS = N->getOperand(1);
1966     SDValue True = N->getOperand(2);
1967     SDValue False = N->getOperand(3);
1968     ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1969
1970     if (LHS.getOperand(2).getNode() != True.getNode() ||
1971         LHS.getOperand(3).getNode() != False.getNode() ||
1972         RHS.getNode() != False.getNode()) {
1973       return SDValue();
1974     }
1975
1976     switch (NCC) {
1977     default: return SDValue();
1978     case ISD::SETNE: return LHS;
1979     case ISD::SETEQ: {
1980       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1981       LHSCC = ISD::getSetCCInverse(LHSCC,
1982                                   LHS.getOperand(0).getValueType().isInteger());
1983       if (DCI.isBeforeLegalizeOps() ||
1984           isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1985         return DAG.getSelectCC(SDLoc(N),
1986                                LHS.getOperand(0),
1987                                LHS.getOperand(1),
1988                                LHS.getOperand(2),
1989                                LHS.getOperand(3),
1990                                LHSCC);
1991       break;
1992     }
1993     }
1994     return SDValue();
1995   }
1996
1997   case AMDGPUISD::EXPORT: {
1998     SDValue Arg = N->getOperand(1);
1999     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2000       break;
2001
2002     SDValue NewArgs[8] = {
2003       N->getOperand(0), // Chain
2004       SDValue(),
2005       N->getOperand(2), // ArrayBase
2006       N->getOperand(3), // Type
2007       N->getOperand(4), // SWZ_X
2008       N->getOperand(5), // SWZ_Y
2009       N->getOperand(6), // SWZ_Z
2010       N->getOperand(7) // SWZ_W
2011     };
2012     SDLoc DL(N);
2013     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2014     return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2015   }
2016   case AMDGPUISD::TEXTURE_FETCH: {
2017     SDValue Arg = N->getOperand(1);
2018     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2019       break;
2020
2021     SDValue NewArgs[19] = {
2022       N->getOperand(0),
2023       N->getOperand(1),
2024       N->getOperand(2),
2025       N->getOperand(3),
2026       N->getOperand(4),
2027       N->getOperand(5),
2028       N->getOperand(6),
2029       N->getOperand(7),
2030       N->getOperand(8),
2031       N->getOperand(9),
2032       N->getOperand(10),
2033       N->getOperand(11),
2034       N->getOperand(12),
2035       N->getOperand(13),
2036       N->getOperand(14),
2037       N->getOperand(15),
2038       N->getOperand(16),
2039       N->getOperand(17),
2040       N->getOperand(18),
2041     };
2042     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2043     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2044         NewArgs);
2045   }
2046   }
2047
2048   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2049 }
2050
2051 static bool
2052 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2053             SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2054   const R600InstrInfo *TII =
2055       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2056   if (!Src.isMachineOpcode())
2057     return false;
2058   switch (Src.getMachineOpcode()) {
2059   case AMDGPU::FNEG_R600:
2060     if (!Neg.getNode())
2061       return false;
2062     Src = Src.getOperand(0);
2063     Neg = DAG.getTargetConstant(1, MVT::i32);
2064     return true;
2065   case AMDGPU::FABS_R600:
2066     if (!Abs.getNode())
2067       return false;
2068     Src = Src.getOperand(0);
2069     Abs = DAG.getTargetConstant(1, MVT::i32);
2070     return true;
2071   case AMDGPU::CONST_COPY: {
2072     unsigned Opcode = ParentNode->getMachineOpcode();
2073     bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2074
2075     if (!Sel.getNode())
2076       return false;
2077
2078     SDValue CstOffset = Src.getOperand(0);
2079     if (ParentNode->getValueType(0).isVector())
2080       return false;
2081
2082     // Gather constants values
2083     int SrcIndices[] = {
2084       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2085       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2086       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2087       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2088       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2089       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2090       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2091       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2092       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2093       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2094       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2095     };
2096     std::vector<unsigned> Consts;
2097     for (int OtherSrcIdx : SrcIndices) {
2098       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2099       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2100         continue;
2101       if (HasDst) {
2102         OtherSrcIdx--;
2103         OtherSelIdx--;
2104       }
2105       if (RegisterSDNode *Reg =
2106           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2107         if (Reg->getReg() == AMDGPU::ALU_CONST) {
2108           ConstantSDNode *Cst
2109             = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2110           Consts.push_back(Cst->getZExtValue());
2111         }
2112       }
2113     }
2114
2115     ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2116     Consts.push_back(Cst->getZExtValue());
2117     if (!TII->fitsConstReadLimitations(Consts)) {
2118       return false;
2119     }
2120
2121     Sel = CstOffset;
2122     Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2123     return true;
2124   }
2125   case AMDGPU::MOV_IMM_I32:
2126   case AMDGPU::MOV_IMM_F32: {
2127     unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2128     uint64_t ImmValue = 0;
2129
2130
2131     if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2132       ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2133       float FloatValue = FPC->getValueAPF().convertToFloat();
2134       if (FloatValue == 0.0) {
2135         ImmReg = AMDGPU::ZERO;
2136       } else if (FloatValue == 0.5) {
2137         ImmReg = AMDGPU::HALF;
2138       } else if (FloatValue == 1.0) {
2139         ImmReg = AMDGPU::ONE;
2140       } else {
2141         ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2142       }
2143     } else {
2144       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2145       uint64_t Value = C->getZExtValue();
2146       if (Value == 0) {
2147         ImmReg = AMDGPU::ZERO;
2148       } else if (Value == 1) {
2149         ImmReg = AMDGPU::ONE_INT;
2150       } else {
2151         ImmValue = Value;
2152       }
2153     }
2154
2155     // Check that we aren't already using an immediate.
2156     // XXX: It's possible for an instruction to have more than one
2157     // immediate operand, but this is not supported yet.
2158     if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2159       if (!Imm.getNode())
2160         return false;
2161       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2162       assert(C);
2163       if (C->getZExtValue())
2164         return false;
2165       Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2166     }
2167     Src = DAG.getRegister(ImmReg, MVT::i32);
2168     return true;
2169   }
2170   default:
2171     return false;
2172   }
2173 }
2174
2175
2176 /// \brief Fold the instructions after selecting them
2177 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2178                                             SelectionDAG &DAG) const {
2179   const R600InstrInfo *TII =
2180       static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2181   if (!Node->isMachineOpcode())
2182     return Node;
2183   unsigned Opcode = Node->getMachineOpcode();
2184   SDValue FakeOp;
2185
2186   std::vector<SDValue> Ops(Node->op_begin(), Node->op_end());
2187
2188   if (Opcode == AMDGPU::DOT_4) {
2189     int OperandIdx[] = {
2190       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2191       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2192       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2193       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2194       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2195       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2196       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2197       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2198         };
2199     int NegIdx[] = {
2200       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2201       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2202       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2203       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2204       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2205       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2206       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2207       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2208     };
2209     int AbsIdx[] = {
2210       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2211       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2212       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2213       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2214       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2215       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2216       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2217       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2218     };
2219     for (unsigned i = 0; i < 8; i++) {
2220       if (OperandIdx[i] < 0)
2221         return Node;
2222       SDValue &Src = Ops[OperandIdx[i] - 1];
2223       SDValue &Neg = Ops[NegIdx[i] - 1];
2224       SDValue &Abs = Ops[AbsIdx[i] - 1];
2225       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2226       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2227       if (HasDst)
2228         SelIdx--;
2229       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2230       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2231         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2232     }
2233   } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2234     for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2235       SDValue &Src = Ops[i];
2236       if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2237         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2238     }
2239   } else if (Opcode == AMDGPU::CLAMP_R600) {
2240     SDValue Src = Node->getOperand(0);
2241     if (!Src.isMachineOpcode() ||
2242         !TII->hasInstrModifiers(Src.getMachineOpcode()))
2243       return Node;
2244     int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2245         AMDGPU::OpName::clamp);
2246     if (ClampIdx < 0)
2247       return Node;
2248     std::vector<SDValue> Ops(Src->op_begin(), Src->op_end());
2249     Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2250     return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2251         Node->getVTList(), Ops);
2252   } else {
2253     if (!TII->hasInstrModifiers(Opcode))
2254       return Node;
2255     int OperandIdx[] = {
2256       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2257       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2258       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2259     };
2260     int NegIdx[] = {
2261       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2262       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2263       TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2264     };
2265     int AbsIdx[] = {
2266       TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2267       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2268       -1
2269     };
2270     for (unsigned i = 0; i < 3; i++) {
2271       if (OperandIdx[i] < 0)
2272         return Node;
2273       SDValue &Src = Ops[OperandIdx[i] - 1];
2274       SDValue &Neg = Ops[NegIdx[i] - 1];
2275       SDValue FakeAbs;
2276       SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2277       bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2278       int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2279       int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2280       if (HasDst) {
2281         SelIdx--;
2282         ImmIdx--;
2283       }
2284       SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2285       SDValue &Imm = Ops[ImmIdx];
2286       if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2287         return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2288     }
2289   }
2290
2291   return Node;
2292 }