src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp

   1 /*
   2  * Copyright 2011 Christoph Bumiller
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include "codegen/nv50_ir.h"
  24 #include "codegen/nv50_ir_build_util.h"
  25
  26 #include "codegen/nv50_ir_target_nvc0.h"
  27
  28 #include <limits>
  29
  30 namespace nv50_ir {
  31
// 2-bit operation selectors that QUADOP() packs into a quad-op mask.
#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

// Packs four QOP_* selectors into one 8-bit value, two bits per field, with
// the first argument in the most significant bits. Field order:
//             UL UR LL LR
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))
  41
// Legalization pass that visits every instruction of each basic block and
// hands integer DIV/MOD and double precision RCP/RSQ to dedicated handlers.
class NVC0LegalizeSSA : public Pass
{
private:
   // Per-block walk that dispatches to the handlers below.
   virtual bool visit(BasicBlock *);
   // Per-function setup: binds the builder to the function's program.
   virtual bool visit(Function *);

   // we want to insert calls to the builtin library only after optimization
   void handleDIV(Instruction *); // integer division, modulus
   void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt

private:
   // Builder used to emit the replacement instructions.
   BuildUtil bld;
};
  55
  56 void
  57 NVC0LegalizeSSA::handleDIV(Instruction *i)
  58 {
  59    FlowInstruction *call;
  60    int builtin;
  61    Value *def[2];
  62
  63    bld.setPosition(i, false);
  64    def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
  65    def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
  66    switch (i->dType) {
  67    case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
  68    case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
  69    default:
  70       return;
  71    }
  72    call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
  73    bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
  74    bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
  75    bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
  76
  77    call->fixed = 1;
  78    call->absolute = call->builtin = 1;
  79    call->target.builtin = builtin;
  80    delete_Instruction(prog, i);
  81 }
  82
// Placeholder for lowering double precision RCP/RSQ; currently does nothing,
// so the instruction is left unchanged.
void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   // TODO
}
  88
// Per-function hook: point the builder at the program owning fn.
// Always returns true.
bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}
  95
  96 bool
  97 NVC0LegalizeSSA::visit(BasicBlock *bb)
  98 {
  99    Instruction *next;
 100    for (Instruction *i = bb->getEntry(); i; i = next) {
 101       next = i->next;
 102       if (i->dType == TYPE_F32)
 103          continue;
 104       switch (i->op) {
 105       case OP_DIV:
 106       case OP_MOD:
 107          handleDIV(i);
 108          break;
 109       case OP_RCP:
 110       case OP_RSQ:
 111          if (i->dType == TYPE_F64)
 112             handleRCPRSQ(i);
 113          break;
 114       default:
 115          break;
 116       }
 117    }
 118    return true;
 119 }
 120
// Legalization pass whose visitors remove no-ops, split 64-bit operations,
// replace zero immediates with a dedicated register, rewrite CONT/JOIN flow
// and, on chipsets >= 0xe0, insert texture barriers.
class NVC0LegalizePostRA : public Pass
{
public:
   NVC0LegalizePostRA(const Program *);

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   // Replace zero-valued immediate sources of an instruction with rZero.
   void replaceZero(Instruction *);
   // Turn a single unconditional CONT into a BRA; returns true on success.
   bool tryReplaceContWithBra(BasicBlock *);
   // Replace branches into a JOIN block with JOIN ops in the predecessors.
   void propagateJoin(BasicBlock *);

   // A use of a texture result that may need a barrier in front of it.
   struct TexUse
   {
      TexUse(Instruction *use, const Instruction *tex)
         : insn(use), tex(tex), level(-1) { }
      Instruction *insn;      // the using instruction
      const Instruction *tex; // or split / mov
      int level;              // barrier level, -1 until computed
   };
   // Lower / upper bound on the number of outstanding TEXes.
   struct Limits
   {
      Limits() { } // leaves min and max uninitialized
      Limits(int min, int max) : min(min), max(max) { }
      int min, max;
   };
   bool insertTextureBarriers(Function *);
   // True if 'later' (first argument) is dominated by 'early' (second).
   inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
   void findFirstUses(const Instruction *tex, const Instruction *def,
                      std::list<TexUse>&);
   void findOverwritingDefs(const Instruction *tex, Instruction *insn,
                            const BasicBlock *term,
                            std::list<TexUse>&);
   void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
   const Instruction *recurseDef(const Instruction *);

private:
   LValue *rZero;         // GPR used in place of zero immediates
   LValue *carry;         // FILE_FLAGS value handed to the 64-bit op splitter
   const bool needTexBar; // true for chipsets >= 0xe0
};
 163
// rZero and carry are created per function in visit(Function *); texture
// barriers are only requested for chipsets >= 0xe0.
NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : rZero(NULL),
     carry(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0)
{
}
 170
 171 bool
 172 NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
 173                                     const Instruction *early) const
 174 {
 175    if (early->bb == later->bb)
 176       return early->serial < later->serial;
 177    return later->bb->dominatedBy(early->bb);
 178 }
 179
 180 void
 181 NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
 182                               Instruction *usei, const Instruction *insn)
 183 {
 184    bool add = true;
 185    for (std::list<TexUse>::iterator it = uses.begin();
 186         it != uses.end();) {
 187       if (insnDominatedBy(usei, it->insn)) {
 188          add = false;
 189          break;
 190       }
 191       if (insnDominatedBy(it->insn, usei))
 192          it = uses.erase(it);
 193       else
 194          ++it;
 195    }
 196    if (add)
 197       uses.push_back(TexUse(usei, insn));
 198 }
 199
 200 void
 201 NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
 202                                         Instruction *insn,
 203                                         const BasicBlock *term,
 204                                         std::list<TexUse> &uses)
 205 {
 206    while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
 207       insn = insn->getSrc(0)->getUniqueInsn();
 208
 209    if (!insn || !insn->bb->reachableBy(texi->bb, term))
 210       return;
 211
 212    switch (insn->op) {
 213    /* Values not connected to the tex's definition through any of these should
 214     * not be conflicting.
 215     */
 216    case OP_SPLIT:
 217    case OP_MERGE:
 218    case OP_PHI:
 219    case OP_UNION:
 220       /* recurse again */
 221       for (int s = 0; insn->srcExists(s); ++s)
 222          findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
 223                              uses);
 224       break;
 225    default:
 226       // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
 227       addTexUse(uses, insn, texi);
 228       break;
 229    }
 230 }
 231
// Collect the first real uses of the values defined by insn.
//
// texi: the texture instruction the search started from
// insn: instruction whose definitions are followed (texi itself on the
//       initial call, a SPLIT/MERGE/PHI/UNION/MOV on recursive calls)
// uses: list receiving the uses via addTexUse()
void
NVC0LegalizePostRA::findFirstUses(const Instruction *texi,
                                  const Instruction *insn,
                                  std::list<TexUse> &uses)
{
   for (int d = 0; insn->defExists(d); ++d) {
      Value *v = insn->getDef(d);
      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
         Instruction *usei = (*u)->getInsn();

         if (usei->op == OP_PHI || usei->op == OP_UNION) {
            // need a barrier before WAW cases
            // (look at every other source of the PHI/UNION, i.e. all but
            // the reference currently being followed)
            for (int s = 0; usei->srcExists(s); ++s) {
               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
               if (defi && &usei->src(s) != *u)
                  findOverwritingDefs(texi, defi, usei->bb, uses);
            }
         }

         if (usei->op == OP_SPLIT ||
             usei->op == OP_MERGE ||
             usei->op == OP_PHI ||
             usei->op == OP_UNION) {
            // these uses don't manifest in the machine code
            findFirstUses(texi, usei, uses);
         } else
         // a move onto itself (unless marked MOV_FINAL) is followed through
         // to its own uses as well
         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
            findFirstUses(texi, usei, uses);
         } else {
            // note: the recorded producer is insn, not necessarily texi
            addTexUse(uses, usei, insn);
         }
      }
   }
}
 267
 268 // Texture barriers:
 269 // This pass is a bit long and ugly and can probably be optimized.
 270 //
 271 // 1. obtain a list of TEXes and their outputs' first use(s)
 272 // 2. calculate the barrier level of each first use (minimal number of TEXes,
 273 //    over all paths, between the TEX and the use in question)
 274 // 3. for each barrier, if all paths from the source TEX to that barrier
 275 //    contain a barrier of lesser level, it can be culled
 276 bool
 277 NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
 278 {
 279    std::list<TexUse> *uses;
 280    std::vector<Instruction *> texes;
 281    std::vector<int> bbFirstTex;
 282    std::vector<int> bbFirstUse;
 283    std::vector<int> texCounts;
 284    std::vector<TexUse> useVec;
 285    ArrayList insns;
 286
 287    fn->orderInstructions(insns);
 288
 289    texCounts.resize(fn->allBBlocks.getSize(), 0);
 290    bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
 291    bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());
 292
 293    // tag BB CFG nodes by their id for later
 294    for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
 295       BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
 296       if (bb)
 297          bb->cfg.tag = bb->getId();
 298    }
 299
 300    // gather the first uses for each TEX
 301    for (int i = 0; i < insns.getSize(); ++i) {
 302       Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
 303       if (isTextureOp(tex->op)) {
 304          texes.push_back(tex);
 305          if (!texCounts.at(tex->bb->getId()))
 306             bbFirstTex[tex->bb->getId()] = texes.size() - 1;
 307          texCounts[tex->bb->getId()]++;
 308       }
 309    }
 310    insns.clear();
 311    if (texes.empty())
 312       return false;
 313    uses = new std::list<TexUse>[texes.size()];
 314    if (!uses)
 315       return false;
 316    for (size_t i = 0; i < texes.size(); ++i)
 317       findFirstUses(texes[i], texes[i], uses[i]);
 318
 319    // determine the barrier level at each use
 320    for (size_t i = 0; i < texes.size(); ++i) {
 321       for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
 322            ++u) {
 323          BasicBlock *tb = texes[i]->bb;
 324          BasicBlock *ub = u->insn->bb;
 325          if (tb == ub) {
 326             u->level = 0;
 327             for (size_t j = i + 1; j < texes.size() &&
 328                     texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
 329                  ++j)
 330                u->level++;
 331          } else {
 332             u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
 333                                                       &ub->cfg, texCounts);
 334             if (u->level < 0) {
 335                WARN("Failed to find path TEX -> TEXBAR\n");
 336                u->level = 0;
 337                continue;
 338             }
 339             // this counted all TEXes in the origin block, correct that
 340             u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
 341             // and did not count the TEXes in the destination block, add those
 342             for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
 343                     texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
 344                  ++j)
 345                u->level++;
 346          }
 347          assert(u->level >= 0);
 348          useVec.push_back(*u);
 349       }
 350    }
 351    delete[] uses;
 352    uses = NULL;
 353
 354    // insert the barriers
 355    for (size_t i = 0; i < useVec.size(); ++i) {
 356       Instruction *prev = useVec[i].insn->prev;
 357       if (useVec[i].level < 0)
 358          continue;
 359       if (prev && prev->op == OP_TEXBAR) {
 360          if (prev->subOp > useVec[i].level)
 361             prev->subOp = useVec[i].level;
 362          prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
 363       } else {
 364          Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
 365          bar->fixed = 1;
 366          bar->subOp = useVec[i].level;
 367          // make use explicit to ease latency calculation
 368          bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
 369          useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
 370       }
 371    }
 372
 373    if (fn->getProgram()->optLevel < 3) {
 374       if (uses)
 375          delete[] uses;
 376       return true;
 377    }
 378
 379    std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
 380
 381    limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
 382    limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
 383    limitS.resize(fn->allBBlocks.getSize());
 384
 385    // cull unneeded barriers (should do that earlier, but for simplicity)
 386    IteratorRef bi = fn->cfg.iteratorCFG();
 387    // first calculate min/max outstanding TEXes for each BB
 388    for (bi->reset(); !bi->end(); bi->next()) {
 389       Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
 390       BasicBlock *bb = BasicBlock::get(n);
 391       int min = 0;
 392       int max = std::numeric_limits<int>::max();
 393       for (Instruction *i = bb->getFirst(); i; i = i->next) {
 394          if (isTextureOp(i->op)) {
 395             min++;
 396             if (max < std::numeric_limits<int>::max())
 397                max++;
 398          } else
 399          if (i->op == OP_TEXBAR) {
 400             min = MIN2(min, i->subOp);
 401             max = MIN2(max, i->subOp);
 402          }
 403       }
 404       // limits when looking at an isolated block
 405       limitS[bb->getId()].min = min;
 406       limitS[bb->getId()].max = max;
 407    }
 408    // propagate the min/max values
 409    for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
 410       for (bi->reset(); !bi->end(); bi->next()) {
 411          Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
 412          BasicBlock *bb = BasicBlock::get(n);
 413          const int bbId = bb->getId();
 414          for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
 415             BasicBlock *in = BasicBlock::get(ei.getNode());
 416             const int inId = in->getId();
 417             limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
 418             limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
 419          }
 420          // I just hope this is correct ...
 421          if (limitS[bbId].max == std::numeric_limits<int>::max()) {
 422             // no barrier
 423             limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
 424             limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
 425          } else {
 426             // block contained a barrier
 427             limitB[bbId].min = MIN2(limitS[bbId].max,
 428                                     limitT[bbId].min + limitS[bbId].min);
 429             limitB[bbId].max = MIN2(limitS[bbId].max,
 430                                     limitT[bbId].max + limitS[bbId].min);
 431          }
 432       }
 433    }
 434    // finally delete unnecessary barriers
 435    for (bi->reset(); !bi->end(); bi->next()) {
 436       Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
 437       BasicBlock *bb = BasicBlock::get(n);
 438       Instruction *prev = NULL;
 439       Instruction *next;
 440       int max = limitT[bb->getId()].max;
 441       for (Instruction *i = bb->getFirst(); i; i = next) {
 442          next = i->next;
 443          if (i->op == OP_TEXBAR) {
 444             if (i->subOp >= max) {
 445                delete_Instruction(prog, i);
 446                i = NULL;
 447             } else {
 448                max = i->subOp;
 449                if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
 450                   delete_Instruction(prog, prev);
 451                   prev = NULL;
 452                }
 453             }
 454          } else
 455          if (isTextureOp(i->op)) {
 456             max++;
 457          }
 458          if (i && !i->isNop())
 459             prev = i;
 460       }
 461    }
 462    if (uses)
 463       delete[] uses;
 464    return true;
 465 }
 466
// Per-function setup: insert texture barriers where the chipset requires
// them, then create the rZero and carry values used by the per-block visitor.
// Always returns true (the result of insertTextureBarriers() is ignored).
bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);
   carry = new_LValue(fn, FILE_FLAGS);

   // rZero gets the id equal to the size of the GPR file, carry gets id 0
   rZero->reg.data.id = prog->getTarget()->getFileSize(FILE_GPR);
   carry->reg.data.id = 0;

   return true;
}
 481
 482 void
 483 NVC0LegalizePostRA::replaceZero(Instruction *i)
 484 {
 485    for (int s = 0; i->srcExists(s); ++s) {
 486       if (s == 2 && i->op == OP_SUCLAMP)
 487          continue;
 488       ImmediateValue *imm = i->getSrc(s)->asImm();
 489       if (imm && imm->reg.data.u64 == 0)
 490          i->setSrc(s, rZero);
 491    }
 492 }
 493
 494 // replace CONT with BRA for single unconditional continue
 495 bool
 496 NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
 497 {
 498    if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
 499       return false;
 500    Graph::EdgeIterator ei = bb->cfg.incident();
 501    if (ei.getType() != Graph::Edge::BACK)
 502       ei.next();
 503    if (ei.getType() != Graph::Edge::BACK)
 504       return false;
 505    BasicBlock *contBB = BasicBlock::get(ei.getNode());
 506
 507    if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
 508        contBB->getExit()->getPredicate())
 509       return false;
 510    contBB->getExit()->op = OP_BRA;
 511    bb->remove(bb->getEntry()); // delete PRECONT
 512
 513    ei.next();
 514    assert(ei.end() || ei.getType() != Graph::Edge::BACK);
 515    return true;
 516 }
 517
 518 // replace branches to join blocks with join ops
 519 void
 520 NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
 521 {
 522    if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
 523       return;
 524    for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
 525       BasicBlock *in = BasicBlock::get(ei.getNode());
 526       Instruction *exit = in->getExit();
 527       if (!exit) {
 528          in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
 529          // there should always be a terminator instruction
 530          WARN("inserted missing terminator in BB:%i\n", in->getId());
 531       } else
 532       if (exit->op == OP_BRA) {
 533          exit->op = OP_JOIN;
 534          exit->asFlow()->limit = 1; // must-not-propagate marker
 535       }
 536    }
 537    bb->remove(bb->getEntry());
 538 }
 539
// Per-block legalization: fixes up EMIT/RESTART, removes no-ops, splits 8-byte
// operations, replaces zero immediates and finally rewrites CONT/JOIN flow.
// Always returns true.
bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      // fetched up front because i may be removed from the block below
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         // drop the destination if nothing references it
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            // continue with the instruction returned by the split, if any
            if (hi)
               next = hi;
         }

         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   // the flow rewrites below dereference the entry instruction
   if (!bb->getEntry())
      return true;

   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}
 578
// Lowering pass for the nvc0 target; each handle*() method rewrites one kind
// of instruction and returns a bool status.
class NVC0LoweringPass : public Pass
{
public:
   NVC0LoweringPass(Program *);

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);
   virtual bool visit(Instruction *);

   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleOUT(Instruction *);
   bool handleDIV(Instruction *);
   bool handleMOD(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);
   bool handleTEX(TexInstruction *);
   bool handleTXD(TexInstruction *);
   bool handleTXQ(TexInstruction *);
   bool handleManualTXD(TexInstruction *);
   bool handleTXLQ(TexInstruction *);
   bool handleATOM(Instruction *);
   bool handleCasExch(Instruction *, bool needCctl);
   void handleSurfaceOpNVE4(TexInstruction *);

   void checkPredicate(Instruction *);

   void readTessCoord(LValue *dst, int c);

   Value *loadResInfo32(Value *ptr, uint32_t off);
   Value *loadMsInfo32(Value *ptr, uint32_t off);
   // Loads the 32-bit entry for 'slot' from the texture binding table in the
   // driver's resource-info constant buffer.
   Value *loadTexHandle(Value *ptr, unsigned int slot);

   void adjustCoordinatesMS(TexInstruction *);
   void processSurfaceCoordsNVE4(TexInstruction *);

private:
   const Target *const targ; // target of the program being lowered

   BuildUtil bld; // builder used to emit replacement instructions

   Symbol *gMemBase;
   LValue *gpEmitAddress; // only assigned for geometry programs
};
 625
 626 NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
 627 {
 628    bld.setProgram(prog);
 629    gMemBase = NULL;
 630 }
 631
 632 bool
 633 NVC0LoweringPass::visit(Function *fn)
 634 {
 635    if (prog->getType() == Program::TYPE_GEOMETRY) {
 636       assert(!strncmp(fn->getName(), "MAIN", 4));
 637       // TODO: when we generate actual functions pass this value along somehow
 638       bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
 639       gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
 640       if (fn->cfgExit) {
 641          bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
 642          bld.mkMovToReg(0, gpEmitAddress);
 643       }
 644    }
 645    return true;
 646 }
 647
// Nothing to do per basic block; always returns true.
bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}
 653
 654 inline Value *
 655 NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
 656 {
 657    uint8_t b = prog->driver->io.resInfoCBSlot;
 658    uint32_t off = prog->driver->io.texBindBase + slot * 4;
 659    return bld.
 660       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
 661 }
 662
 663 // move array source to first slot, convert to u16, add indirections
 664 bool
 665 NVC0LoweringPass::handleTEX(TexInstruction *i)
 666 {
 667    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
 668    const int arg = i->tex.target.getArgCount();
 669    const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
 670    const int chipset = prog->getTarget()->getChipset();
 671
 672    if (chipset >= NVISA_GK104_CHIPSET) {
 673       if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
 674          WARN("indirect TEX not implemented\n");
 675       }
 676       if (i->tex.r == i->tex.s) {
 677          i->tex.r += prog->driver->io.texBindBase / 4;
 678          i->tex.s  = 0; // only a single cX[] value possible here
 679       } else {
 680          Value *hnd = bld.getScratch();
 681          Value *rHnd = loadTexHandle(NULL, i->tex.r);
 682          Value *sHnd = loadTexHandle(NULL, i->tex.s);
 683
 684          bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);
 685
 686          i->tex.r = 0; // not used for indirect tex
 687          i->tex.s = 0;
 688          i->setIndirectR(hnd);
 689       }
 690       if (i->tex.target.isArray()) {
 691          LValue *layer = new_LValue(func, FILE_GPR);
 692          Value *src = i->getSrc(lyr);
 693          const int sat = (i->op == OP_TXF) ? 1 : 0;
 694          DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
 695          bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
 696          for (int s = dim; s >= 1; --s)
 697             i->setSrc(s, i->getSrc(s - 1));
 698          i->setSrc(0, layer);
 699       }
 700    } else
 701    // (nvc0) generate and move the tsc/tic/array source to the front
 702    if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
 703       LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
 704
 705       Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
 706       for (int s = dim; s >= 1; --s)
 707          i->setSrc(s, i->getSrc(s - 1));
 708       i->setSrc(0, arrayIndex);
 709
 710       Value *ticRel = i->getIndirectR();
 711       Value *tscRel = i->getIndirectS();
 712
 713       if (arrayIndex) {
 714          int sat = (i->op == OP_TXF) ? 1 : 0;
 715          DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
 716          bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
 717       } else {
 718          bld.loadImm(src, 0);
 719       }
 720
 721       if (ticRel) {
 722          i->setSrc(i->tex.rIndirectSrc, NULL);
 723          bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
 724       }
 725       if (tscRel) {
 726          i->setSrc(i->tex.sIndirectSrc, NULL);
 727          bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
 728       }
 729
 730       i->setSrc(0, src);
 731    }
 732
 733    // For nvc0, the sample id has to be in the second operand, as the offset
 734    // does. Right now we don't know how to pass both in, and this case can't
 735    // happen with OpenGL. On nve0, the sample id is part of the texture
 736    // coordinate argument.
 737    assert(chipset >= NVISA_GK104_CHIPSET ||
 738           !i->tex.useOffsets || !i->tex.target.isMS());
 739
 740    // offset is between lod and dc
 741    if (i->tex.useOffsets) {
 742       int n, c;
 743       int s = i->srcCount(0xff, true);
 744       if (i->tex.target.isShadow())
 745          s--;
 746       if (i->srcExists(s)) // move potential predicate out of the way
 747          i->moveSources(s, 1);
 748       if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
 749          i->moveSources(s + 1, 1);
 750       if (i->op == OP_TXG) {
 751          // Either there is 1 offset, which goes into the 2 low bytes of the
 752          // first source, or there are 4 offsets, which go into 2 sources (8
 753          // values, 1 byte each).
 754          Value *offs[2] = {NULL, NULL};
 755          for (n = 0; n < i->tex.useOffsets; n++) {
 756             for (c = 0; c < 2; ++c) {
 757                if ((n % 2) == 0 && c == 0)
 758                   offs[n / 2] = i->offset[n][c].get();
 759                else
 760                   bld.mkOp3(OP_INSBF, TYPE_U32,
 761                             offs[n / 2],
 762                             i->offset[n][c].get(),
 763                             bld.mkImm(0x800 | ((n * 16 + c * 8) % 32)),
 764                             offs[n / 2]);
 765             }
 766          }
 767          i->setSrc(s, offs[0]);
 768          if (offs[1])
 769             i->setSrc(s + 1, offs[1]);
 770       } else {
 771          unsigned imm = 0;
 772          assert(i->tex.useOffsets == 1);
 773          for (c = 0; c < 3; ++c) {
 774             ImmediateValue val;
 775             assert(i->offset[0][c].getImmediate(val));
 776             imm |= (val.reg.data.u32 & 0xf) << (c * 4);
 777          }
 778          i->setSrc(s, bld.loadImm(NULL, imm));
 779       }
 780    }
 781
 782    if (chipset >= NVISA_GK104_CHIPSET) {
 783       //
 784       // If TEX requires more than 4 sources, the 2nd register tuple must be
 785       // aligned to 4, even if it consists of just a single 4-byte register.
 786       //
 787       // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
 788       //
 789       int s = i->srcCount(0xff, true);
 790       if (s > 4 && s < 7) {
 791          if (i->srcExists(s)) // move potential predicate out of the way
 792             i->moveSources(s, 7 - s);
 793          while (s < 7)
 794             i->setSrc(s++, bld.loadImm(NULL, 0));
 795       }
 796    }
 797
 798    return true;
 799 }
 800
// Emulate a TXD with four plain TEX operations, one per quad lane.
//
// For each lane l the coordinates of that lane are broadcast, dPdx/dPdy are
// applied through quad operations, a clone of the (now OP_TEX) instruction is
// issued and its results are saved for lane l only. The per-lane results are
// finally combined with OP_UNION into the original destinations and the
// original instruction is removed. Always returns true.
bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   // quad-op masks per lane: [l][0] applies dPdx, [l][1] applies dPdy
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4]; // indexed [result component][lane]
   Value *crd[3];    // scratch coordinates, one per dimension
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         // restrict the copy to lane l
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // merge the four per-lane results of every component
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
 858
 859 bool
 860 NVC0LoweringPass::handleTXD(TexInstruction *txd)
 861 {
 862    int dim = txd->tex.target.getDim();
 863    int arg = txd->tex.target.getArgCount();
 864
 865    handleTEX(txd);
 866    while (txd->srcExists(arg))
 867       ++arg;
 868
 869    txd->tex.derivAll = true;
 870    if (dim > 2 ||
 871        txd->tex.target.isCube() ||
 872        arg > 4 ||
 873        txd->tex.target.isShadow())
 874       return handleManualTXD(txd);
 875
 876    for (int c = 0; c < dim; ++c) {
 877       txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
 878       txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
 879       txd->dPdx[c].set(NULL);
 880       txd->dPdy[c].set(NULL);
 881    }
 882    return true;
 883 }
 884
 885 bool
 886 NVC0LoweringPass::handleTXQ(TexInstruction *txq)
 887 {
 888    // TODO: indirect resource/sampler index
 889    return true;
 890 }
 891
 892 bool
 893 NVC0LoweringPass::handleTXLQ(TexInstruction *i)
 894 {
 895    /* The outputs are inverted compared to what the TGSI instruction
 896     * expects. Take that into account in the mask.
 897     */
 898    assert((i->tex.mask & ~3) == 0);
 899    if (i->tex.mask == 1)
 900       i->tex.mask = 2;
 901    else if (i->tex.mask == 2)
 902       i->tex.mask = 1;
 903    handleTEX(i);
 904    bld.setPosition(i, true);
 905
 906    /* The returned values are not quite what we want:
 907     * (a) convert from s16/u16 to f32
 908     * (b) multiply by 1/256
 909     */
 910    for (int def = 0; def < 2; ++def) {
 911       if (!i->defExists(def))
 912          continue;
 913       enum DataType type = TYPE_S16;
 914       if (i->tex.mask == 2 || def > 0)
 915          type = TYPE_U16;
 916       bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
 917       bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
 918                 i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
 919    }
 920    if (i->tex.mask == 3) {
 921       LValue *t = new_LValue(func, FILE_GPR);
 922       bld.mkMov(t, i->getDef(0));
 923       bld.mkMov(i->getDef(0), i->getDef(1));
 924       bld.mkMov(i->getDef(1), t);
 925    }
 926    return true;
 927 }
 928
 929
 930 bool
 931 NVC0LoweringPass::handleATOM(Instruction *atom)
 932 {
 933    SVSemantic sv;
 934
 935    switch (atom->src(0).getFile()) {
 936    case FILE_MEMORY_LOCAL:
 937       sv = SV_LBASE;
 938       break;
 939    case FILE_MEMORY_SHARED:
 940       sv = SV_SBASE;
 941       break;
 942    default:
 943       assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
 944       return true;
 945    }
 946    Value *base =
 947       bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
 948    Value *ptr = atom->getIndirect(0, 0);
 949
 950    atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
 951    atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
 952    if (ptr)
 953       base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
 954    atom->setIndirect(0, 0, base);
 955
 956    return true;
 957 }
 958
 959 bool
 960 NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
 961 {
 962    if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
 963        cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
 964       return false;
 965    bld.setPosition(cas, true);
 966
 967    if (needCctl) {
 968       Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
 969       cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
 970       cctl->fixed = 1;
 971       cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
 972       if (cas->isPredicated())
 973          cctl->setPredicate(cas->cc, cas->getPredicate());
 974    }
 975
 976    if (cas->defExists(0) && cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
 977       // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
 978       // should be set to the high part of the double reg or bad things will
 979       // happen elsewhere in the universe.
 980       // Also, it sometimes returns the new value instead of the old one
 981       // under mysterious circumstances.
 982       Value *dreg = bld.getSSA(8);
 983       bld.setPosition(cas, false);
 984       bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
 985       cas->setSrc(1, dreg);
 986    }
 987
 988    return true;
 989 }
 990
 991 inline Value *
 992 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
 993 {
 994    uint8_t b = prog->driver->io.resInfoCBSlot;
 995    off += prog->driver->io.suInfoBase;
 996    return bld.
 997       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
 998 }
 999
1000 inline Value *
1001 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
1002 {
1003    uint8_t b = prog->driver->io.msInfoCBSlot;
1004    off += prog->driver->io.msInfoBase;
1005    return bld.
1006       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
1007 }
1008
1009 /* On nvc0, surface info is obtained via the surface binding points passed
1010  * to the SULD/SUST instructions.
1011  * On nve4, surface info is stored in c[] and is used by various special
 * instructions, e.g. for clamping coordinates or generating an address.
1013  * They couldn't just have added an equivalent to TIC now, couldn't they ?
1014  */
// Byte offsets of the individual 32-bit fields inside one surface info
// entry. The code in this file adds them to (surface index * stride) and
// reads them through loadResInfo32().
#define NVE4_SU_INFO_ADDR   0x00
#define NVE4_SU_INFO_FMT    0x04
#define NVE4_SU_INFO_DIM_X  0x08
#define NVE4_SU_INFO_PITCH  0x0c
#define NVE4_SU_INFO_DIM_Y  0x10
#define NVE4_SU_INFO_ARRAY  0x14
#define NVE4_SU_INFO_DIM_Z  0x18
#define NVE4_SU_INFO_UNK1C  0x1c
#define NVE4_SU_INFO_WIDTH  0x20
#define NVE4_SU_INFO_HEIGHT 0x24
#define NVE4_SU_INFO_DEPTH  0x28
#define NVE4_SU_INFO_TARGET 0x2c
#define NVE4_SU_INFO_CALL   0x30
#define NVE4_SU_INFO_RAW_X  0x34
#define NVE4_SU_INFO_MS_X   0x38
#define NVE4_SU_INFO_MS_Y   0x3c

// Size in bytes of one surface info entry.
#define NVE4_SU_INFO__STRIDE 0x40

// Indexed accessors for the per-axis fields above:
//   DIM(0..2)  -> DIM_X, DIM_Y, DIM_Z   (8 bytes apart)
//   SIZE(0..2) -> WIDTH, HEIGHT, DEPTH  (4 bytes apart)
//   MS(0..1)   -> MS_X, MS_Y            (4 bytes apart)
#define NVE4_SU_INFO_DIM(i)  (0x08 + (i) * 8)
#define NVE4_SU_INFO_SIZE(i) (0x20 + (i) * 4)
#define NVE4_SU_INFO_MS(i)   (0x38 + (i) * 4)
1037
1038 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
1039 {
1040    switch (su->tex.target.getEnum()) {
1041    case TEX_TARGET_BUFFER:      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
1042    case TEX_TARGET_RECT:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1043    case TEX_TARGET_1D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1044    case TEX_TARGET_1D_ARRAY:    return (c == 1) ?
1045                                    NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
1046                                    NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1047    case TEX_TARGET_2D:          return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1048    case TEX_TARGET_2D_MS:       return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
1049    case TEX_TARGET_2D_ARRAY:    return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1050    case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1051    case TEX_TARGET_3D:          return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1052    case TEX_TARGET_CUBE:        return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1053    case TEX_TARGET_CUBE_ARRAY:  return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
1054    default:
1055       assert(0);
1056       return 0;
1057    }
1058 }
1059
1060 void
1061 NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
1062 {
1063    const uint16_t base = tex->tex.r * NVE4_SU_INFO__STRIDE;
1064    const int arg = tex->tex.target.getArgCount();
1065
1066    if (tex->tex.target == TEX_TARGET_2D_MS)
1067       tex->tex.target = TEX_TARGET_2D;
1068    else
1069    if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
1070       tex->tex.target = TEX_TARGET_2D_ARRAY;
1071    else
1072       return;
1073
1074    Value *x = tex->getSrc(0);
1075    Value *y = tex->getSrc(1);
1076    Value *s = tex->getSrc(arg - 1);
1077
1078    Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
1079
1080    Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
1081    Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));
1082
1083    bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
1084    bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
1085
1086    s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
1087    s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
1088
1089    Value *dx = loadMsInfo32(ts, 0x0);
1090    Value *dy = loadMsInfo32(ts, 0x4);
1091
1092    bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
1093    bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
1094
1095    tex->setSrc(0, tx);
1096    tex->setSrc(1, ty);
1097    tex->moveSources(arg, -1);
1098 }
1099
// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
// They're computed from the coordinates using the surface info in c[] space.
//
// All new instructions are inserted before @su. When this returns:
//   src 0 = 64-bit address (MERGE of the low word bf and the high word eau)
//   src 1 = format word from the surface info, or immediate 0 for raw ops
//   src 2 = predicate written by the clamp / SUBFM instructions
// The coordinate sources are removed via moveSources(); handleSurfaceOpNVE4()
// reads the remaining data operands starting at source index 3.
void
NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
{
   Instruction *insn;
   // atom: reduction/atomic surface op; raw: the "B" variants of the ops
   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
   const bool raw =
      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
   const int idx = su->tex.r;
   const int dim = su->tex.target.getDim();
   // number of coordinate sources: one per dimension plus the array index
   const int arg = dim + (su->tex.target.isArray() ? 1 : 0);
   // byte offset of this surface's info entry
   const uint16_t base = idx * NVE4_SU_INFO__STRIDE;
   int c;
   Value *zero = bld.mkImm(0);
   Value *p1 = NULL;
   Value *v;
   Value *src[3];
   Value *bf, *eau, *off;
   Value *addr, *pred;

   off = bld.getScratch(4);
   bf = bld.getScratch(4);
   addr = bld.getSSA(8);
   pred = bld.getScratch(1, FILE_PREDICATE);

   bld.setPosition(su, false);

   // may rewrite the target and coordinates of multisample surfaces
   adjustCoordinatesMS(su);

   // calculate clamped coordinates
   for (c = 0; c < arg; ++c) {
      src[c] = bld.getScratch();
      // raw accesses clamp x against RAW_X instead of DIM_X
      if (c == 0 && raw)
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
      else
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
         ->subOp = getSuClampSubOp(su, c);
   }
   // unused coordinate slots read as zero
   for (; c < 3; ++c)
      src[c] = zero;

   // set predicate output
   if (su->tex.target == TEX_TARGET_BUFFER) {
      src[0]->getInsn()->setFlagsDef(1, pred);
   } else
   if (su->tex.target.isArray()) {
      // the array index clamp gets its own predicate, OR'ed into pred below
      p1 = bld.getSSA(1, FILE_PREDICATE);
      src[dim]->getInsn()->setFlagsDef(1, p1);
   }

   // calculate pixel offset
   if (dim == 1) {
      // buffers leave off unwritten here
      if (su->tex.target != TEX_TARGET_BUFFER)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   } else
   if (dim == 3) {
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l

      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   } else {
      assert(dim == 2);
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
         ->subOp = su->tex.target.isArray() ?
         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
   }

   // calculate effective address part 1
   if (su->tex.target == TEX_TARGET_BUFFER) {
      if (raw) {
         // raw buffer access uses the clamped x coordinate directly
         bf = src[0];
      } else {
         v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
   } else {
      Value *y = src[1];
      Value *z = src[2];
      uint16_t subOp = 0;

      switch (dim) {
      case 1:
         y = zero;
         z = zero;
         break;
      case 2:
         z = off;
         if (!su->tex.target.isArray()) {
            z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
      default:
         subOp = NV50_IR_SUBOP_SUBFM_3D;
         assert(dim == 3);
         break;
      }
      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
      insn->subOp = subOp;
      // for non-buffer targets the main predicate comes from SUBFM
      insn->setFlagsDef(1, pred);
   }

   // part 2
   v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);

   if (su->tex.target == TEX_TARGET_BUFFER) {
      eau = v;
   } else {
      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
   }
   // add array layer offset
   if (su->tex.target.isArray()) {
      v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
      // the array index lives in src[1] for 1D arrays, src[2] otherwise
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
      else
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
      // combine predicates
      assert(p1);
      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
   }

   if (atom) {
      Value *lo = bf;
      if (su->tex.target == TEX_TARGET_BUFFER) {
         // keep the buffer offset in off; it is added to addr after the MERGE
         lo = zero;
         bld.mkMov(off, bf);
      }
      //  bf == g[] address & 0xff
      // eau == g[] address >> 8
      bld.mkOp3(OP_PERMT, TYPE_U32,  bf,   lo, bld.loadImm(NULL, 0x6540), eau);
      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
   } else
   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
      // Convert from u32 to u8 address format, which is what the library code
      // doing SULDP currently uses.
      // XXX: can SUEAU do this ?
      // XXX: does it matter that we don't mask high bytes in bf ?
      // Grrr.
      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
   }

   // addr = 64-bit value built from bf (first operand) and eau (second)
   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);

   if (atom && su->tex.target == TEX_TARGET_BUFFER)
      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);

   // let's just set it 0 for raw access and hope it works
   v = raw ?
      bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
   // set 64 bit address and 32-bit format sources
   su->setSrc(0, addr);
   su->setSrc(1, v);
   su->setSrc(2, pred);
}
1268
1269 void
1270 NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
1271 {
1272    processSurfaceCoordsNVE4(su);
1273
1274    // Who do we hate more ? The person who decided that nvc0's SULD doesn't
1275    // have to support conversion or the person who decided that, in OpenCL,
1276    // you don't have to specify the format here like you do in OpenGL ?
1277
1278    if (su->op == OP_SULDP) {
1279       // We don't patch shaders. Ever.
1280       // You get an indirect call to our library blob here.
1281       // But at least it's uniform.
1282       FlowInstruction *call;
1283       LValue *p[3];
1284       LValue *r[5];
1285       uint16_t base = su->tex.r * NVE4_SU_INFO__STRIDE + NVE4_SU_INFO_CALL;
1286
1287       for (int i = 0; i < 4; ++i)
1288          (r[i] = bld.getScratch(4, FILE_GPR))->reg.data.id = i;
1289       for (int i = 0; i < 3; ++i)
1290          (p[i] = bld.getScratch(1, FILE_PREDICATE))->reg.data.id = i;
1291       (r[4] = bld.getScratch(8, FILE_GPR))->reg.data.id = 4;
1292
1293       bld.mkMov(p[1], bld.mkImm((su->cache == CACHE_CA) ? 1 : 0), TYPE_U8);
1294       bld.mkMov(p[2], bld.mkImm((su->cache == CACHE_CG) ? 1 : 0), TYPE_U8);
1295       bld.mkMov(p[0], su->getSrc(2), TYPE_U8);
1296       bld.mkMov(r[4], su->getSrc(0), TYPE_U64);
1297       bld.mkMov(r[2], su->getSrc(1), TYPE_U32);
1298
1299       call = bld.mkFlow(OP_CALL, NULL, su->cc, su->getPredicate());
1300
1301       call->indirect = 1;
1302       call->absolute = 1;
1303       call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
1304                                    prog->driver->io.resInfoCBSlot, TYPE_U32,
1305                                    prog->driver->io.suInfoBase + base));
1306       call->setSrc(1, r[2]);
1307       call->setSrc(2, r[4]);
1308       for (int i = 0; i < 3; ++i)
1309          call->setSrc(3 + i, p[i]);
1310       for (int i = 0; i < 4; ++i) {
1311          call->setDef(i, r[i]);
1312          bld.mkMov(su->getDef(i), r[i]);
1313       }
1314       call->setDef(4, p[1]);
1315       delete_Instruction(bld.getProgram(), su);
1316    }
1317
1318    if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
1319       // FIXME: for out of bounds access, destination value will be undefined !
1320       Value *pred = su->getSrc(2);
1321       CondCode cc = CC_NOT_P;
1322       if (su->getPredicate()) {
1323          pred = bld.getScratch(1, FILE_PREDICATE);
1324          cc = su->cc;
1325          if (cc == CC_NOT_P) {
1326             bld.mkOp2(OP_OR, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
1327          } else {
1328             bld.mkOp2(OP_AND, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
1329             pred->getInsn()->src(1).mod = Modifier(NV50_IR_MOD_NOT);
1330          }
1331       }
1332       Instruction *red = bld.mkOp(OP_ATOM, su->dType, su->getDef(0));
1333       red->subOp = su->subOp;
1334       if (!gMemBase)
1335          gMemBase = bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0);
1336       red->setSrc(0, gMemBase);
1337       red->setSrc(1, su->getSrc(3));
1338       if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
1339          red->setSrc(2, su->getSrc(4));
1340       red->setIndirect(0, 0, su->getSrc(0));
1341       red->setPredicate(cc, pred);
1342       delete_Instruction(bld.getProgram(), su);
1343       handleCasExch(red, true);
1344    } else {
1345       su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
1346    }
1347 }
1348
1349 bool
1350 NVC0LoweringPass::handleWRSV(Instruction *i)
1351 {
1352    Instruction *st;
1353    Symbol *sym;
1354    uint32_t addr;
1355
1356    // must replace, $sreg are not writeable
1357    addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
1358    if (addr >= 0x400)
1359       return false;
1360    sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
1361
1362    st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
1363                     i->getSrc(1));
1364    st->perPatch = i->perPatch;
1365
1366    bld.getBB()->remove(i);
1367    return true;
1368 }
1369
1370 void
1371 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
1372 {
1373    Value *laneid = bld.getSSA();
1374    Value *x, *y;
1375
1376    bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
1377
1378    if (c == 0) {
1379       x = dst;
1380       y = NULL;
1381    } else
1382    if (c == 1) {
1383       x = NULL;
1384       y = dst;
1385    } else {
1386       assert(c == 2);
1387       x = bld.getSSA();
1388       y = bld.getSSA();
1389    }
1390    if (x)
1391       bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
1392    if (y)
1393       bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
1394
1395    if (c == 2) {
1396       bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
1397       bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
1398    }
1399 }
1400
1401 bool
1402 NVC0LoweringPass::handleRDSV(Instruction *i)
1403 {
1404    Symbol *sym = i->getSrc(0)->asSym();
1405    const SVSemantic sv = sym->reg.data.sv.sv;
1406    Value *vtx = NULL;
1407    Instruction *ld;
1408    uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
1409
1410    if (addr >= 0x400) {
1411       // mov $sreg
1412       if (sym->reg.data.sv.index == 3) {
1413          // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
1414          i->op = OP_MOV;
1415          i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
1416       }
1417       return true;
1418    }
1419
1420    switch (sv) {
1421    case SV_POSITION:
1422       assert(prog->getType() == Program::TYPE_FRAGMENT);
1423       bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
1424       break;
1425    case SV_FACE:
1426    {
1427       Value *face = i->getDef(0);
1428       bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
1429       if (i->dType == TYPE_F32) {
1430          bld.mkOp2(OP_AND, TYPE_U32, face, face, bld.mkImm(0x80000000));
1431          bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
1432       }
1433    }
1434       break;
1435    case SV_TESS_COORD:
1436       assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
1437       readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
1438       break;
1439    case SV_NTID:
1440    case SV_NCTAID:
1441    case SV_GRIDID:
1442       assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
1443       if (sym->reg.data.sv.index == 3) {
1444          i->op = OP_MOV;
1445          i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
1446          return true;
1447       }
1448       addr += prog->driver->prop.cp.gridInfoBase;
1449       bld.mkLoad(TYPE_U32, i->getDef(0),
1450                  bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL);
1451       break;
1452    case SV_SAMPLE_INDEX:
1453       // TODO: Properly pass source as an address in the PIX address space
1454       // (which can be of the form [r0+offset]). But this is currently
1455       // unnecessary.
1456       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
1457       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
1458       break;
1459    case SV_SAMPLE_POS: {
1460       Value *off = new_LValue(func, FILE_GPR);
1461       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
1462       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
1463       bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
1464       bld.mkLoad(TYPE_F32,
1465                  i->getDef(0),
1466                  bld.mkSymbol(
1467                        FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
1468                        TYPE_U32, prog->driver->io.sampleInfoBase +
1469                        4 * sym->reg.data.sv.index),
1470                  off);
1471       break;
1472    }
1473    case SV_SAMPLE_MASK:
1474       ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
1475       ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
1476       break;
1477    default:
1478       if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
1479          vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
1480       ld = bld.mkFetch(i->getDef(0), i->dType,
1481                        FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
1482       ld->perPatch = i->perPatch;
1483       break;
1484    }
1485    bld.getBB()->remove(i);
1486    return true;
1487 }
1488
1489 bool
1490 NVC0LoweringPass::handleDIV(Instruction *i)
1491 {
1492    if (!isFloatType(i->dType))
1493       return true;
1494    bld.setPosition(i, false);
1495    Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
1496    i->op = OP_MUL;
1497    i->setSrc(1, rcp->getDef(0));
1498    return true;
1499 }
1500
1501 bool
1502 NVC0LoweringPass::handleMOD(Instruction *i)
1503 {
1504    if (i->dType != TYPE_F32)
1505       return true;
1506    LValue *value = bld.getScratch();
1507    bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1));
1508    bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value);
1509    bld.mkOp1(OP_TRUNC, TYPE_F32, value, value);
1510    bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), value);
1511    i->op = OP_SUB;
1512    i->setSrc(1, value);
1513    return true;
1514 }
1515
1516 bool
1517 NVC0LoweringPass::handleSQRT(Instruction *i)
1518 {
1519    Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
1520                                 bld.getSSA(), i->getSrc(0));
1521    i->op = OP_MUL;
1522    i->setSrc(1, rsq->getDef(0));
1523
1524    return true;
1525 }
1526
1527 bool
1528 NVC0LoweringPass::handlePOW(Instruction *i)
1529 {
1530    LValue *val = bld.getScratch();
1531
1532    bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
1533    bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
1534    bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
1535
1536    i->op = OP_EX2;
1537    i->setSrc(0, val);
1538    i->setSrc(1, NULL);
1539
1540    return true;
1541 }
1542
1543 bool
1544 NVC0LoweringPass::handleEXPORT(Instruction *i)
1545 {
1546    if (prog->getType() == Program::TYPE_FRAGMENT) {
1547       int id = i->getSrc(0)->reg.data.offset / 4;
1548
1549       if (i->src(0).isIndirect(0)) // TODO, ugly
1550          return false;
1551       i->op = OP_MOV;
1552       i->subOp = NV50_IR_SUBOP_MOV_FINAL;
1553       i->src(0).set(i->src(1));
1554       i->setSrc(1, NULL);
1555       i->setDef(0, new_LValue(func, FILE_GPR));
1556       i->getDef(0)->reg.data.id = id;
1557
1558       prog->maxGPR = MAX2(prog->maxGPR, id);
1559    } else
1560    if (prog->getType() == Program::TYPE_GEOMETRY) {
1561       i->setIndirect(0, 1, gpEmitAddress);
1562    }
1563    return true;
1564 }
1565
1566 bool
1567 NVC0LoweringPass::handleOUT(Instruction *i)
1568 {
1569    if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) {
1570       i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
1571       delete_Instruction(prog, i);
1572    } else {
1573       assert(gpEmitAddress);
1574       i->setDef(0, gpEmitAddress);
1575       if (i->srcExists(0))
1576          i->setSrc(1, i->getSrc(0));
1577       i->setSrc(0, gpEmitAddress);
1578    }
1579    return true;
1580 }
1581
1582 // Generate a binary predicate if an instruction is predicated by
1583 // e.g. an f32 value.
1584 void
1585 NVC0LoweringPass::checkPredicate(Instruction *insn)
1586 {
1587    Value *pred = insn->getPredicate();
1588    Value *pdst;
1589
1590    if (!pred || pred->reg.file == FILE_PREDICATE)
1591       return;
1592    pdst = new_LValue(func, FILE_PREDICATE);
1593
1594    // CAUTION: don't use pdst->getInsn, the definition might not be unique,
1595    //  delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
1596
1597    bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
1598
1599    insn->setPredicate(insn->cc, pdst);
1600 }
1601
1602 //
1603 // - add quadop dance for texturing
1604 // - put FP outputs in GPRs
1605 // - convert instruction sequences
1606 //
1607 bool
1608 NVC0LoweringPass::visit(Instruction *i)
1609 {
1610    bld.setPosition(i, false);
1611
1612    if (i->cc != CC_ALWAYS)
1613       checkPredicate(i);
1614
1615    switch (i->op) {
1616    case OP_TEX:
1617    case OP_TXB:
1618    case OP_TXL:
1619    case OP_TXF:
1620    case OP_TXG:
1621       return handleTEX(i->asTex());
1622    case OP_TXD:
1623       return handleTXD(i->asTex());
1624    case OP_TXLQ:
1625       return handleTXLQ(i->asTex());
1626    case OP_TXQ:
1627      return handleTXQ(i->asTex());
1628    case OP_EX2:
1629       bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
1630       i->setSrc(0, i->getDef(0));
1631       break;
1632    case OP_POW:
1633       return handlePOW(i);
1634    case OP_DIV:
1635       return handleDIV(i);
1636    case OP_MOD:
1637       return handleMOD(i);
1638    case OP_SQRT:
1639       return handleSQRT(i);
1640    case OP_EXPORT:
1641       return handleEXPORT(i);
1642    case OP_EMIT:
1643    case OP_RESTART:
1644       return handleOUT(i);
1645    case OP_RDSV:
1646       return handleRDSV(i);
1647    case OP_WRSV:
1648       return handleWRSV(i);
1649    case OP_LOAD:
1650       if (i->src(0).getFile() == FILE_SHADER_INPUT) {
1651          if (prog->getType() == Program::TYPE_COMPUTE) {
1652             i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
1653             i->getSrc(0)->reg.fileIndex = 0;
1654          } else
1655          if (prog->getType() == Program::TYPE_GEOMETRY &&
1656              i->src(0).isIndirect(0)) {
1657             // XXX: this assumes vec4 units
1658             Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
1659                                     i->getIndirect(0, 0), bld.mkImm(4));
1660             i->setIndirect(0, 0, ptr);
1661          } else {
1662             i->op = OP_VFETCH;
1663             assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
1664          }
1665       }
1666       break;
1667    case OP_ATOM:
1668    {
1669       const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
1670       handleATOM(i);
1671       handleCasExch(i, cctl);
1672    }
1673       break;
1674    case OP_SULDB:
1675    case OP_SULDP:
1676    case OP_SUSTB:
1677    case OP_SUSTP:
1678    case OP_SUREDB:
1679    case OP_SUREDP:
1680       if (targ->getChipset() >= NVISA_GK104_CHIPSET)
1681          handleSurfaceOpNVE4(i->asTex());
1682       break;
1683    default:
1684       break;
1685    }
1686    return true;
1687 }
1688
1689 bool
1690 TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
1691 {
1692    if (stage == CG_STAGE_PRE_SSA) {
1693       NVC0LoweringPass pass(prog);
1694       return pass.run(prog, false, true);
1695    } else
1696    if (stage == CG_STAGE_POST_RA) {
1697       NVC0LegalizePostRA pass(prog);
1698       return pass.run(prog, false, true);
1699    } else
1700    if (stage == CG_STAGE_SSA) {
1701       NVC0LegalizeSSA pass;
1702       return pass.run(prog, false, true);
1703    }
1704    return false;
1705 }
1706
1707 } // namespace nv50_ir