src/mesa/program/prog_execute.c

   1 /*
   2  * Mesa 3-D graphics library
   3  * Version:  7.3
   4  *
   5  * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
   6  *
   7  * Permission is hereby granted, free of charge, to any person obtaining a
   8  * copy of this software and associated documentation files (the "Software"),
   9  * to deal in the Software without restriction, including without limitation
  10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  11  * and/or sell copies of the Software, and to permit persons to whom the
  12  * Software is furnished to do so, subject to the following conditions:
  13  *
  14  * The above copyright notice and this permission notice shall be included
  15  * in all copies or substantial portions of the Software.
  16  *
  17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 /**
  26  * \file prog_execute.c
  27  * Software interpreter for vertex/fragment programs.
  28  * \author Brian Paul
  29  */
  30
  31 /*
  32  * NOTE: we do everything in single-precision floating point; we don't
  33  * currently observe the single/half/fixed-precision qualifiers.
  34  *
  35  */
  36
  37
  38 #include "main/glheader.h"
  39 #include "main/colormac.h"
  40 #include "main/macros.h"
  41 #include "prog_execute.h"
  42 #include "prog_instruction.h"
  43 #include "prog_parameter.h"
  44 #include "prog_print.h"
  45 #include "prog_noise.h"
  46
  47
  48 /* debug predicate */
  49 #define DEBUG_PROG 0
  50
  51
  52 /**
  53  * Set x to positive or negative infinity.
  54  */
  55 #if defined(USE_IEEE) || defined(_WIN32)
  56 #define SET_POS_INFINITY(x)                  \
  57    do {                                      \
  58          fi_type fi;                         \
  59          fi.i = 0x7F800000;                  \
  60          x = fi.f;                           \
  61    } while (0)
  62 #define SET_NEG_INFINITY(x)                  \
  63    do {                                      \
  64          fi_type fi;                         \
  65          fi.i = 0xFF800000;                  \
  66          x = fi.f;                           \
  67    } while (0)
  68 #else
  69 #define SET_POS_INFINITY(x)  x = (GLfloat) HUGE_VAL
  70 #define SET_NEG_INFINITY(x)  x = (GLfloat) -HUGE_VAL
  71 #endif
  72
  73 #define SET_FLOAT_BITS(x, bits) ((fi_type *) (void *) &(x))->i = bits
  74
  75
  76 static const GLfloat ZeroVec[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
  77
  78
  79 /**
  80  * Return a pointer to the 4-element float vector specified by the given
  81  * source register.
  82  */
  83 static inline const GLfloat *
  84 get_src_register_pointer(const struct prog_src_register *source,
  85                          const struct gl_program_machine *machine)
  86 {
  87    const struct gl_program *prog = machine->CurProgram;
  88    GLint reg = source->Index;
  89
  90    if (source->RelAddr) {
  91       /* add address register value to src index/offset */
  92       reg += machine->AddressReg[0][0];
  93       if (reg < 0) {
  94          return ZeroVec;
  95       }
  96    }
  97
  98    switch (source->File) {
  99    case PROGRAM_TEMPORARY:
 100       if (reg >= MAX_PROGRAM_TEMPS)
 101          return ZeroVec;
 102       return machine->Temporaries[reg];
 103
 104    case PROGRAM_INPUT:
 105       if (prog->Target == GL_VERTEX_PROGRAM_ARB) {
 106          if (reg >= VERT_ATTRIB_MAX)
 107             return ZeroVec;
 108          return machine->VertAttribs[reg];
 109       }
 110       else {
 111          if (reg >= VARYING_SLOT_MAX)
 112             return ZeroVec;
 113          return machine->Attribs[reg][machine->CurElement];
 114       }
 115
 116    case PROGRAM_OUTPUT:
 117       if (reg >= MAX_PROGRAM_OUTPUTS)
 118          return ZeroVec;
 119       return machine->Outputs[reg];
 120
 121    case PROGRAM_LOCAL_PARAM:
 122       if (reg >= MAX_PROGRAM_LOCAL_PARAMS)
 123          return ZeroVec;
 124       return machine->CurProgram->LocalParams[reg];
 125
 126    case PROGRAM_ENV_PARAM:
 127       if (reg >= MAX_PROGRAM_ENV_PARAMS)
 128          return ZeroVec;
 129       return machine->EnvParams[reg];
 130
 131    case PROGRAM_STATE_VAR:
 132       /* Fallthrough */
 133    case PROGRAM_CONSTANT:
 134       /* Fallthrough */
 135    case PROGRAM_UNIFORM:
 136       if (reg >= (GLint) prog->Parameters->NumParameters)
 137          return ZeroVec;
 138       return (GLfloat *) prog->Parameters->ParameterValues[reg];
 139
 140    case PROGRAM_SYSTEM_VALUE:
 141       assert(reg < Elements(machine->SystemValues));
 142       return machine->SystemValues[reg];
 143
 144    default:
 145       _mesa_problem(NULL,
 146          "Invalid src register file %d in get_src_register_pointer()",
 147          source->File);
 148       return NULL;
 149    }
 150 }
 151
 152
 153 /**
 154  * Return a pointer to the 4-element float vector specified by the given
 155  * destination register.
 156  */
 157 static inline GLfloat *
 158 get_dst_register_pointer(const struct prog_dst_register *dest,
 159                          struct gl_program_machine *machine)
 160 {
 161    static GLfloat dummyReg[4];
 162    GLint reg = dest->Index;
 163
 164    if (dest->RelAddr) {
 165       /* add address register value to src index/offset */
 166       reg += machine->AddressReg[0][0];
 167       if (reg < 0) {
 168          return dummyReg;
 169       }
 170    }
 171
 172    switch (dest->File) {
 173    case PROGRAM_TEMPORARY:
 174       if (reg >= MAX_PROGRAM_TEMPS)
 175          return dummyReg;
 176       return machine->Temporaries[reg];
 177
 178    case PROGRAM_OUTPUT:
 179       if (reg >= MAX_PROGRAM_OUTPUTS)
 180          return dummyReg;
 181       return machine->Outputs[reg];
 182
 183    default:
 184       _mesa_problem(NULL,
 185          "Invalid dest register file %d in get_dst_register_pointer()",
 186          dest->File);
 187       return NULL;
 188    }
 189 }
 190
 191
 192
 193 /**
 194  * Fetch a 4-element float vector from the given source register.
 195  * Apply swizzling and negating as needed.
 196  */
 197 static void
 198 fetch_vector4(const struct prog_src_register *source,
 199               const struct gl_program_machine *machine, GLfloat result[4])
 200 {
 201    const GLfloat *src = get_src_register_pointer(source, machine);
 202    ASSERT(src);
 203
 204    if (source->Swizzle == SWIZZLE_NOOP) {
 205       /* no swizzling */
 206       COPY_4V(result, src);
 207    }
 208    else {
 209       ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
 210       ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
 211       ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
 212       ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
 213       result[0] = src[GET_SWZ(source->Swizzle, 0)];
 214       result[1] = src[GET_SWZ(source->Swizzle, 1)];
 215       result[2] = src[GET_SWZ(source->Swizzle, 2)];
 216       result[3] = src[GET_SWZ(source->Swizzle, 3)];
 217    }
 218
 219    if (source->Abs) {
 220       result[0] = FABSF(result[0]);
 221       result[1] = FABSF(result[1]);
 222       result[2] = FABSF(result[2]);
 223       result[3] = FABSF(result[3]);
 224    }
 225    if (source->Negate) {
 226       ASSERT(source->Negate == NEGATE_XYZW);
 227       result[0] = -result[0];
 228       result[1] = -result[1];
 229       result[2] = -result[2];
 230       result[3] = -result[3];
 231    }
 232
 233 #ifdef NAN_CHECK
 234    assert(!IS_INF_OR_NAN(result[0]));
 235    assert(!IS_INF_OR_NAN(result[0]));
 236    assert(!IS_INF_OR_NAN(result[0]));
 237    assert(!IS_INF_OR_NAN(result[0]));
 238 #endif
 239 }
 240
 241
 242 /**
 243  * Fetch the derivative with respect to X or Y for the given register.
 244  * XXX this currently only works for fragment program input attribs.
 245  */
 246 static void
 247 fetch_vector4_deriv(struct gl_context * ctx,
 248                     const struct prog_src_register *source,
 249                     const struct gl_program_machine *machine,
 250                     char xOrY, GLfloat result[4])
 251 {
 252    if (source->File == PROGRAM_INPUT &&
 253        source->Index < (GLint) machine->NumDeriv) {
 254       const GLint col = machine->CurElement;
 255       const GLfloat w = machine->Attribs[VARYING_SLOT_POS][col][3];
 256       const GLfloat invQ = 1.0f / w;
 257       GLfloat deriv[4];
 258
 259       if (xOrY == 'X') {
 260          deriv[0] = machine->DerivX[source->Index][0] * invQ;
 261          deriv[1] = machine->DerivX[source->Index][1] * invQ;
 262          deriv[2] = machine->DerivX[source->Index][2] * invQ;
 263          deriv[3] = machine->DerivX[source->Index][3] * invQ;
 264       }
 265       else {
 266          deriv[0] = machine->DerivY[source->Index][0] * invQ;
 267          deriv[1] = machine->DerivY[source->Index][1] * invQ;
 268          deriv[2] = machine->DerivY[source->Index][2] * invQ;
 269          deriv[3] = machine->DerivY[source->Index][3] * invQ;
 270       }
 271
 272       result[0] = deriv[GET_SWZ(source->Swizzle, 0)];
 273       result[1] = deriv[GET_SWZ(source->Swizzle, 1)];
 274       result[2] = deriv[GET_SWZ(source->Swizzle, 2)];
 275       result[3] = deriv[GET_SWZ(source->Swizzle, 3)];
 276
 277       if (source->Abs) {
 278          result[0] = FABSF(result[0]);
 279          result[1] = FABSF(result[1]);
 280          result[2] = FABSF(result[2]);
 281          result[3] = FABSF(result[3]);
 282       }
 283       if (source->Negate) {
 284          ASSERT(source->Negate == NEGATE_XYZW);
 285          result[0] = -result[0];
 286          result[1] = -result[1];
 287          result[2] = -result[2];
 288          result[3] = -result[3];
 289       }
 290    }
 291    else {
 292       ASSIGN_4V(result, 0.0, 0.0, 0.0, 0.0);
 293    }
 294 }
 295
 296
 297 /**
 298  * As above, but only return result[0] element.
 299  */
 300 static void
 301 fetch_vector1(const struct prog_src_register *source,
 302               const struct gl_program_machine *machine, GLfloat result[4])
 303 {
 304    const GLfloat *src = get_src_register_pointer(source, machine);
 305    ASSERT(src);
 306
 307    result[0] = src[GET_SWZ(source->Swizzle, 0)];
 308
 309    if (source->Abs) {
 310       result[0] = FABSF(result[0]);
 311    }
 312    if (source->Negate) {
 313       result[0] = -result[0];
 314    }
 315 }
 316
 317
 318 static GLuint
 319 fetch_vector1ui(const struct prog_src_register *source,
 320                 const struct gl_program_machine *machine)
 321 {
 322    const GLuint *src = (GLuint *) get_src_register_pointer(source, machine);
 323    return src[GET_SWZ(source->Swizzle, 0)];
 324 }
 325
 326
 327 /**
 328  * Fetch texel from texture.  Use partial derivatives when possible.
 329  */
 330 static inline void
 331 fetch_texel(struct gl_context *ctx,
 332             const struct gl_program_machine *machine,
 333             const struct prog_instruction *inst,
 334             const GLfloat texcoord[4], GLfloat lodBias,
 335             GLfloat color[4])
 336 {
 337    const GLuint unit = machine->Samplers[inst->TexSrcUnit];
 338
 339    /* Note: we only have the right derivatives for fragment input attribs.
 340     */
 341    if (machine->NumDeriv > 0 &&
 342        inst->SrcReg[0].File == PROGRAM_INPUT &&
 343        inst->SrcReg[0].Index == VARYING_SLOT_TEX0 + inst->TexSrcUnit) {
 344       /* simple texture fetch for which we should have derivatives */
 345       GLuint attr = inst->SrcReg[0].Index;
 346       machine->FetchTexelDeriv(ctx, texcoord,
 347                                machine->DerivX[attr],
 348                                machine->DerivY[attr],
 349                                lodBias, unit, color);
 350    }
 351    else {
 352       machine->FetchTexelLod(ctx, texcoord, lodBias, unit, color);
 353    }
 354 }
 355
 356
 357 /**
 358  * Test value against zero and return GT, LT, EQ or UN if NaN.
 359  */
 360 static inline GLuint
 361 generate_cc(float value)
 362 {
 363    if (value != value)
 364       return COND_UN;           /* NaN */
 365    if (value > 0.0F)
 366       return COND_GT;
 367    if (value < 0.0F)
 368       return COND_LT;
 369    return COND_EQ;
 370 }
 371
 372
 373 /**
 374  * Test if the ccMaskRule is satisfied by the given condition code.
 375  * Used to mask destination writes according to the current condition code.
 376  */
 377 static inline GLboolean
 378 test_cc(GLuint condCode, GLuint ccMaskRule)
 379 {
 380    switch (ccMaskRule) {
 381    case COND_EQ: return (condCode == COND_EQ);
 382    case COND_NE: return (condCode != COND_EQ);
 383    case COND_LT: return (condCode == COND_LT);
 384    case COND_GE: return (condCode == COND_GT || condCode == COND_EQ);
 385    case COND_LE: return (condCode == COND_LT || condCode == COND_EQ);
 386    case COND_GT: return (condCode == COND_GT);
 387    case COND_TR: return GL_TRUE;
 388    case COND_FL: return GL_FALSE;
 389    default:      return GL_TRUE;
 390    }
 391 }
 392
 393
 394 /**
 395  * Evaluate the 4 condition codes against a predicate and return GL_TRUE
 396  * or GL_FALSE to indicate result.
 397  */
 398 static inline GLboolean
 399 eval_condition(const struct gl_program_machine *machine,
 400                const struct prog_instruction *inst)
 401 {
 402    const GLuint swizzle = inst->DstReg.CondSwizzle;
 403    const GLuint condMask = inst->DstReg.CondMask;
 404    if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) ||
 405        test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) ||
 406        test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) ||
 407        test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
 408       return GL_TRUE;
 409    }
 410    else {
 411       return GL_FALSE;
 412    }
 413 }
 414
 415
 416
 417 /**
 418  * Store 4 floats into a register.  Observe the instructions saturate and
 419  * set-condition-code flags.
 420  */
 421 static void
 422 store_vector4(const struct prog_instruction *inst,
 423               struct gl_program_machine *machine, const GLfloat value[4])
 424 {
 425    const struct prog_dst_register *dstReg = &(inst->DstReg);
 426    const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
 427    GLuint writeMask = dstReg->WriteMask;
 428    GLfloat clampedValue[4];
 429    GLfloat *dst = get_dst_register_pointer(dstReg, machine);
 430
 431 #if 0
 432    if (value[0] > 1.0e10 ||
 433        IS_INF_OR_NAN(value[0]) ||
 434        IS_INF_OR_NAN(value[1]) ||
 435        IS_INF_OR_NAN(value[2]) || IS_INF_OR_NAN(value[3]))
 436       printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
 437 #endif
 438
 439    if (clamp) {
 440       clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
 441       clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
 442       clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
 443       clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
 444       value = clampedValue;
 445    }
 446
 447    if (dstReg->CondMask != COND_TR) {
 448       /* condition codes may turn off some writes */
 449       if (writeMask & WRITEMASK_X) {
 450          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 0)],
 451                       dstReg->CondMask))
 452             writeMask &= ~WRITEMASK_X;
 453       }
 454       if (writeMask & WRITEMASK_Y) {
 455          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 1)],
 456                       dstReg->CondMask))
 457             writeMask &= ~WRITEMASK_Y;
 458       }
 459       if (writeMask & WRITEMASK_Z) {
 460          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 2)],
 461                       dstReg->CondMask))
 462             writeMask &= ~WRITEMASK_Z;
 463       }
 464       if (writeMask & WRITEMASK_W) {
 465          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 3)],
 466                       dstReg->CondMask))
 467             writeMask &= ~WRITEMASK_W;
 468       }
 469    }
 470
 471 #ifdef NAN_CHECK
 472    assert(!IS_INF_OR_NAN(value[0]));
 473    assert(!IS_INF_OR_NAN(value[0]));
 474    assert(!IS_INF_OR_NAN(value[0]));
 475    assert(!IS_INF_OR_NAN(value[0]));
 476 #endif
 477
 478    if (writeMask & WRITEMASK_X)
 479       dst[0] = value[0];
 480    if (writeMask & WRITEMASK_Y)
 481       dst[1] = value[1];
 482    if (writeMask & WRITEMASK_Z)
 483       dst[2] = value[2];
 484    if (writeMask & WRITEMASK_W)
 485       dst[3] = value[3];
 486
 487    if (inst->CondUpdate) {
 488       if (writeMask & WRITEMASK_X)
 489          machine->CondCodes[0] = generate_cc(value[0]);
 490       if (writeMask & WRITEMASK_Y)
 491          machine->CondCodes[1] = generate_cc(value[1]);
 492       if (writeMask & WRITEMASK_Z)
 493          machine->CondCodes[2] = generate_cc(value[2]);
 494       if (writeMask & WRITEMASK_W)
 495          machine->CondCodes[3] = generate_cc(value[3]);
 496 #if DEBUG_PROG
 497       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 498              _mesa_condcode_string(machine->CondCodes[0]),
 499              _mesa_condcode_string(machine->CondCodes[1]),
 500              _mesa_condcode_string(machine->CondCodes[2]),
 501              _mesa_condcode_string(machine->CondCodes[3]));
 502 #endif
 503    }
 504 }
 505
 506
 507 /**
 508  * Store 4 uints into a register.  Observe the set-condition-code flags.
 509  */
 510 static void
 511 store_vector4ui(const struct prog_instruction *inst,
 512                 struct gl_program_machine *machine, const GLuint value[4])
 513 {
 514    const struct prog_dst_register *dstReg = &(inst->DstReg);
 515    GLuint writeMask = dstReg->WriteMask;
 516    GLuint *dst = (GLuint *) get_dst_register_pointer(dstReg, machine);
 517
 518    if (dstReg->CondMask != COND_TR) {
 519       /* condition codes may turn off some writes */
 520       if (writeMask & WRITEMASK_X) {
 521          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 0)],
 522                       dstReg->CondMask))
 523             writeMask &= ~WRITEMASK_X;
 524       }
 525       if (writeMask & WRITEMASK_Y) {
 526          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 1)],
 527                       dstReg->CondMask))
 528             writeMask &= ~WRITEMASK_Y;
 529       }
 530       if (writeMask & WRITEMASK_Z) {
 531          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 2)],
 532                       dstReg->CondMask))
 533             writeMask &= ~WRITEMASK_Z;
 534       }
 535       if (writeMask & WRITEMASK_W) {
 536          if (!test_cc(machine->CondCodes[GET_SWZ(dstReg->CondSwizzle, 3)],
 537                       dstReg->CondMask))
 538             writeMask &= ~WRITEMASK_W;
 539       }
 540    }
 541
 542    if (writeMask & WRITEMASK_X)
 543       dst[0] = value[0];
 544    if (writeMask & WRITEMASK_Y)
 545       dst[1] = value[1];
 546    if (writeMask & WRITEMASK_Z)
 547       dst[2] = value[2];
 548    if (writeMask & WRITEMASK_W)
 549       dst[3] = value[3];
 550
 551    if (inst->CondUpdate) {
 552       if (writeMask & WRITEMASK_X)
 553          machine->CondCodes[0] = generate_cc((float)value[0]);
 554       if (writeMask & WRITEMASK_Y)
 555          machine->CondCodes[1] = generate_cc((float)value[1]);
 556       if (writeMask & WRITEMASK_Z)
 557          machine->CondCodes[2] = generate_cc((float)value[2]);
 558       if (writeMask & WRITEMASK_W)
 559          machine->CondCodes[3] = generate_cc((float)value[3]);
 560 #if DEBUG_PROG
 561       printf("CondCodes=(%s,%s,%s,%s) for:\n",
 562              _mesa_condcode_string(machine->CondCodes[0]),
 563              _mesa_condcode_string(machine->CondCodes[1]),
 564              _mesa_condcode_string(machine->CondCodes[2]),
 565              _mesa_condcode_string(machine->CondCodes[3]));
 566 #endif
 567    }
 568 }
 569
 570
 571
 572 /**
 573  * Execute the given vertex/fragment program.
 574  *
 575  * \param ctx  rendering context
 576  * \param program  the program to execute
 577  * \param machine  machine state (must be initialized)
 578  * \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
 579  */
 580 GLboolean
 581 _mesa_execute_program(struct gl_context * ctx,
 582                       const struct gl_program *program,
 583                       struct gl_program_machine *machine)
 584 {
 585    const GLuint numInst = program->NumInstructions;
 586    const GLuint maxExec = 65536;
 587    GLuint pc, numExec = 0;
 588
 589    machine->CurProgram = program;
 590
 591    if (DEBUG_PROG) {
 592       printf("execute program %u --------------------\n", program->Id);
 593    }
 594
 595    if (program->Target == GL_VERTEX_PROGRAM_ARB) {
 596       machine->EnvParams = ctx->VertexProgram.Parameters;
 597    }
 598    else {
 599       machine->EnvParams = ctx->FragmentProgram.Parameters;
 600    }
 601
 602    for (pc = 0; pc < numInst; pc++) {
 603       const struct prog_instruction *inst = program->Instructions + pc;
 604
 605       if (DEBUG_PROG) {
 606          _mesa_print_instruction(inst);
 607       }
 608
 609       switch (inst->Opcode) {
 610       case OPCODE_ABS:
 611          {
 612             GLfloat a[4], result[4];
 613             fetch_vector4(&inst->SrcReg[0], machine, a);
 614             result[0] = FABSF(a[0]);
 615             result[1] = FABSF(a[1]);
 616             result[2] = FABSF(a[2]);
 617             result[3] = FABSF(a[3]);
 618             store_vector4(inst, machine, result);
 619          }
 620          break;
 621       case OPCODE_ADD:
 622          {
 623             GLfloat a[4], b[4], result[4];
 624             fetch_vector4(&inst->SrcReg[0], machine, a);
 625             fetch_vector4(&inst->SrcReg[1], machine, b);
 626             result[0] = a[0] + b[0];
 627             result[1] = a[1] + b[1];
 628             result[2] = a[2] + b[2];
 629             result[3] = a[3] + b[3];
 630             store_vector4(inst, machine, result);
 631             if (DEBUG_PROG) {
 632                printf("ADD (%g %g %g %g) = (%g %g %g %g) + (%g %g %g %g)\n",
 633                       result[0], result[1], result[2], result[3],
 634                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
 635             }
 636          }
 637          break;
 638       case OPCODE_ARL:
 639          {
 640             GLfloat t[4];
 641             fetch_vector4(&inst->SrcReg[0], machine, t);
 642             machine->AddressReg[0][0] = IFLOOR(t[0]);
 643             if (DEBUG_PROG) {
 644                printf("ARL %d\n", machine->AddressReg[0][0]);
 645             }
 646          }
 647          break;
 648       case OPCODE_BGNLOOP:
 649          /* no-op */
 650          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 651                 == OPCODE_ENDLOOP);
 652          break;
 653       case OPCODE_ENDLOOP:
 654          /* subtract 1 here since pc is incremented by for(pc) loop */
 655          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 656                 == OPCODE_BGNLOOP);
 657          pc = inst->BranchTarget - 1;   /* go to matching BNGLOOP */
 658          break;
 659       case OPCODE_BGNSUB:      /* begin subroutine */
 660          break;
 661       case OPCODE_ENDSUB:      /* end subroutine */
 662          break;
 663       case OPCODE_BRK:         /* break out of loop (conditional) */
 664          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 665                 == OPCODE_ENDLOOP);
 666          if (eval_condition(machine, inst)) {
 667             /* break out of loop */
 668             /* pc++ at end of for-loop will put us after the ENDLOOP inst */
 669             pc = inst->BranchTarget;
 670          }
 671          break;
 672       case OPCODE_CONT:        /* continue loop (conditional) */
 673          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 674                 == OPCODE_ENDLOOP);
 675          if (eval_condition(machine, inst)) {
 676             /* continue at ENDLOOP */
 677             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 678             pc = inst->BranchTarget - 1;
 679          }
 680          break;
 681       case OPCODE_CAL:         /* Call subroutine (conditional) */
 682          if (eval_condition(machine, inst)) {
 683             /* call the subroutine */
 684             if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
 685                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
 686             }
 687             machine->CallStack[machine->StackDepth++] = pc + 1; /* next inst */
 688             /* Subtract 1 here since we'll do pc++ at end of for-loop */
 689             pc = inst->BranchTarget - 1;
 690          }
 691          break;
 692       case OPCODE_CMP:
 693          {
 694             GLfloat a[4], b[4], c[4], result[4];
 695             fetch_vector4(&inst->SrcReg[0], machine, a);
 696             fetch_vector4(&inst->SrcReg[1], machine, b);
 697             fetch_vector4(&inst->SrcReg[2], machine, c);
 698             result[0] = a[0] < 0.0F ? b[0] : c[0];
 699             result[1] = a[1] < 0.0F ? b[1] : c[1];
 700             result[2] = a[2] < 0.0F ? b[2] : c[2];
 701             result[3] = a[3] < 0.0F ? b[3] : c[3];
 702             store_vector4(inst, machine, result);
 703             if (DEBUG_PROG) {
 704                printf("CMP (%g %g %g %g) = (%g %g %g %g) < 0 ? (%g %g %g %g) : (%g %g %g %g)\n",
 705                       result[0], result[1], result[2], result[3],
 706                       a[0], a[1], a[2], a[3],
 707                       b[0], b[1], b[2], b[3],
 708                       c[0], c[1], c[2], c[3]);
 709             }
 710          }
 711          break;
 712       case OPCODE_COS:
 713          {
 714             GLfloat a[4], result[4];
 715             fetch_vector1(&inst->SrcReg[0], machine, a);
 716             result[0] = result[1] = result[2] = result[3]
 717                = (GLfloat) cos(a[0]);
 718             store_vector4(inst, machine, result);
 719          }
 720          break;
 721       case OPCODE_DDX:         /* Partial derivative with respect to X */
 722          {
 723             GLfloat result[4];
 724             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 725                                 'X', result);
 726             store_vector4(inst, machine, result);
 727          }
 728          break;
 729       case OPCODE_DDY:         /* Partial derivative with respect to Y */
 730          {
 731             GLfloat result[4];
 732             fetch_vector4_deriv(ctx, &inst->SrcReg[0], machine,
 733                                 'Y', result);
 734             store_vector4(inst, machine, result);
 735          }
 736          break;
 737       case OPCODE_DP2:
 738          {
 739             GLfloat a[4], b[4], result[4];
 740             fetch_vector4(&inst->SrcReg[0], machine, a);
 741             fetch_vector4(&inst->SrcReg[1], machine, b);
 742             result[0] = result[1] = result[2] = result[3] = DOT2(a, b);
 743             store_vector4(inst, machine, result);
 744             if (DEBUG_PROG) {
 745                printf("DP2 %g = (%g %g) . (%g %g)\n",
 746                       result[0], a[0], a[1], b[0], b[1]);
 747             }
 748          }
 749          break;
 750       case OPCODE_DP3:
 751          {
 752             GLfloat a[4], b[4], result[4];
 753             fetch_vector4(&inst->SrcReg[0], machine, a);
 754             fetch_vector4(&inst->SrcReg[1], machine, b);
 755             result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
 756             store_vector4(inst, machine, result);
 757             if (DEBUG_PROG) {
 758                printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
 759                       result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
 760             }
 761          }
 762          break;
 763       case OPCODE_DP4:
 764          {
 765             GLfloat a[4], b[4], result[4];
 766             fetch_vector4(&inst->SrcReg[0], machine, a);
 767             fetch_vector4(&inst->SrcReg[1], machine, b);
 768             result[0] = result[1] = result[2] = result[3] = DOT4(a, b);
 769             store_vector4(inst, machine, result);
 770             if (DEBUG_PROG) {
 771                printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
 772                       result[0], a[0], a[1], a[2], a[3],
 773                       b[0], b[1], b[2], b[3]);
 774             }
 775          }
 776          break;
 777       case OPCODE_DPH:
 778          {
 779             GLfloat a[4], b[4], result[4];
 780             fetch_vector4(&inst->SrcReg[0], machine, a);
 781             fetch_vector4(&inst->SrcReg[1], machine, b);
 782             result[0] = result[1] = result[2] = result[3] = DOT3(a, b) + b[3];
 783             store_vector4(inst, machine, result);
 784          }
 785          break;
 786       case OPCODE_DST:         /* Distance vector */
 787          {
 788             GLfloat a[4], b[4], result[4];
 789             fetch_vector4(&inst->SrcReg[0], machine, a);
 790             fetch_vector4(&inst->SrcReg[1], machine, b);
 791             result[0] = 1.0F;
 792             result[1] = a[1] * b[1];
 793             result[2] = a[2];
 794             result[3] = b[3];
 795             store_vector4(inst, machine, result);
 796          }
 797          break;
 798       case OPCODE_EXP:
 799          {
 800             GLfloat t[4], q[4], floor_t0;
 801             fetch_vector1(&inst->SrcReg[0], machine, t);
 802             floor_t0 = FLOORF(t[0]);
 803             if (floor_t0 > FLT_MAX_EXP) {
 804                SET_POS_INFINITY(q[0]);
 805                SET_POS_INFINITY(q[2]);
 806             }
 807             else if (floor_t0 < FLT_MIN_EXP) {
 808                q[0] = 0.0F;
 809                q[2] = 0.0F;
 810             }
 811             else {
 812                q[0] = LDEXPF(1.0, (int) floor_t0);
 813                /* Note: GL_NV_vertex_program expects
 814                 * result.z = result.x * APPX(result.y)
 815                 * We do what the ARB extension says.
 816                 */
 817                q[2] = (GLfloat) pow(2.0, t[0]);
 818             }
 819             q[1] = t[0] - floor_t0;
 820             q[3] = 1.0F;
 821             store_vector4( inst, machine, q );
 822          }
 823          break;
 824       case OPCODE_EX2:         /* Exponential base 2 */
 825          {
 826             GLfloat a[4], result[4], val;
 827             fetch_vector1(&inst->SrcReg[0], machine, a);
 828             val = (GLfloat) pow(2.0, a[0]);
 829             /*
 830             if (IS_INF_OR_NAN(val))
 831                val = 1.0e10;
 832             */
 833             result[0] = result[1] = result[2] = result[3] = val;
 834             store_vector4(inst, machine, result);
 835          }
 836          break;
 837       case OPCODE_FLR:
 838          {
 839             GLfloat a[4], result[4];
 840             fetch_vector4(&inst->SrcReg[0], machine, a);
 841             result[0] = FLOORF(a[0]);
 842             result[1] = FLOORF(a[1]);
 843             result[2] = FLOORF(a[2]);
 844             result[3] = FLOORF(a[3]);
 845             store_vector4(inst, machine, result);
 846          }
 847          break;
 848       case OPCODE_FRC:
 849          {
 850             GLfloat a[4], result[4];
 851             fetch_vector4(&inst->SrcReg[0], machine, a);
 852             result[0] = a[0] - FLOORF(a[0]);
 853             result[1] = a[1] - FLOORF(a[1]);
 854             result[2] = a[2] - FLOORF(a[2]);
 855             result[3] = a[3] - FLOORF(a[3]);
 856             store_vector4(inst, machine, result);
 857          }
 858          break;
 859       case OPCODE_IF:
 860          {
 861             GLboolean cond;
 862             ASSERT(program->Instructions[inst->BranchTarget].Opcode
 863                    == OPCODE_ELSE ||
 864                    program->Instructions[inst->BranchTarget].Opcode
 865                    == OPCODE_ENDIF);
 866             /* eval condition */
 867             if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
 868                GLfloat a[4];
 869                fetch_vector1(&inst->SrcReg[0], machine, a);
 870                cond = (a[0] != 0.0);
 871             }
 872             else {
 873                cond = eval_condition(machine, inst);
 874             }
 875             if (DEBUG_PROG) {
 876                printf("IF: %d\n", cond);
 877             }
 878             /* do if/else */
 879             if (cond) {
 880                /* do if-clause (just continue execution) */
 881             }
 882             else {
 883                /* go to the instruction after ELSE or ENDIF */
 884                assert(inst->BranchTarget >= 0);
 885                pc = inst->BranchTarget;
 886             }
 887          }
 888          break;
 889       case OPCODE_ELSE:
 890          /* goto ENDIF */
 891          ASSERT(program->Instructions[inst->BranchTarget].Opcode
 892                 == OPCODE_ENDIF);
 893          assert(inst->BranchTarget >= 0);
 894          pc = inst->BranchTarget;
 895          break;
 896       case OPCODE_ENDIF:
 897          /* nothing */
 898          break;
 899       case OPCODE_KIL_NV:      /* NV_f_p only (conditional) */
 900          if (eval_condition(machine, inst)) {
 901             return GL_FALSE;
 902          }
 903          break;
 904       case OPCODE_KIL:         /* ARB_f_p only */
 905          {
 906             GLfloat a[4];
 907             fetch_vector4(&inst->SrcReg[0], machine, a);
 908             if (DEBUG_PROG) {
 909                printf("KIL if (%g %g %g %g) <= 0.0\n",
 910                       a[0], a[1], a[2], a[3]);
 911             }
 912
 913             if (a[0] < 0.0F || a[1] < 0.0F || a[2] < 0.0F || a[3] < 0.0F) {
 914                return GL_FALSE;
 915             }
 916          }
 917          break;
 918       case OPCODE_LG2:         /* log base 2 */
 919          {
 920             GLfloat a[4], result[4], val;
 921             fetch_vector1(&inst->SrcReg[0], machine, a);
 922             /* The fast LOG2 macro doesn't meet the precision requirements.
 923              */
 924             if (a[0] == 0.0F) {
 925                val = -FLT_MAX;
 926             }
 927             else {
 928                val = (float)(log(a[0]) * 1.442695F);
 929             }
 930             result[0] = result[1] = result[2] = result[3] = val;
 931             store_vector4(inst, machine, result);
 932          }
 933          break;
 934       case OPCODE_LIT:
 935          {
 936             const GLfloat epsilon = 1.0F / 256.0F;      /* from NV VP spec */
 937             GLfloat a[4], result[4];
 938             fetch_vector4(&inst->SrcReg[0], machine, a);
 939             a[0] = MAX2(a[0], 0.0F);
 940             a[1] = MAX2(a[1], 0.0F);
 941             /* XXX ARB version clamps a[3], NV version doesn't */
 942             a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
 943             result[0] = 1.0F;
 944             result[1] = a[0];
 945             /* XXX we could probably just use pow() here */
 946             if (a[0] > 0.0F) {
 947                if (a[1] == 0.0 && a[3] == 0.0)
 948                   result[2] = 1.0F;
 949                else
 950                   result[2] = (GLfloat) pow(a[1], a[3]);
 951             }
 952             else {
 953                result[2] = 0.0F;
 954             }
 955             result[3] = 1.0F;
 956             store_vector4(inst, machine, result);
 957             if (DEBUG_PROG) {
 958                printf("LIT (%g %g %g %g) : (%g %g %g %g)\n",
 959                       result[0], result[1], result[2], result[3],
 960                       a[0], a[1], a[2], a[3]);
 961             }
 962          }
 963          break;
 964       case OPCODE_LOG:
 965          {
 966             GLfloat t[4], q[4], abs_t0;
 967             fetch_vector1(&inst->SrcReg[0], machine, t);
 968             abs_t0 = FABSF(t[0]);
 969             if (abs_t0 != 0.0F) {
 970                if (IS_INF_OR_NAN(abs_t0))
 971                {
 972                   SET_POS_INFINITY(q[0]);
 973                   q[1] = 1.0F;
 974                   SET_POS_INFINITY(q[2]);
 975                }
 976                else {
 977                   int exponent;
 978                   GLfloat mantissa = FREXPF(t[0], &exponent);
 979                   q[0] = (GLfloat) (exponent - 1);
 980                   q[1] = (GLfloat) (2.0 * mantissa); /* map [.5, 1) -> [1, 2) */
 981
 982                   /* The fast LOG2 macro doesn't meet the precision
 983                    * requirements.
 984                    */
 985                   q[2] = (float)(log(t[0]) * 1.442695F);
 986                }
 987             }
 988             else {
 989                SET_NEG_INFINITY(q[0]);
 990                q[1] = 1.0F;
 991                SET_NEG_INFINITY(q[2]);
 992             }
 993             q[3] = 1.0;
 994             store_vector4(inst, machine, q);
 995          }
 996          break;
 997       case OPCODE_LRP:
 998          {
 999             GLfloat a[4], b[4], c[4], result[4];
1000             fetch_vector4(&inst->SrcReg[0], machine, a);
1001             fetch_vector4(&inst->SrcReg[1], machine, b);
1002             fetch_vector4(&inst->SrcReg[2], machine, c);
1003             result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
1004             result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
1005             result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
1006             result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
1007             store_vector4(inst, machine, result);
1008             if (DEBUG_PROG) {
1009                printf("LRP (%g %g %g %g) = (%g %g %g %g), "
1010                       "(%g %g %g %g), (%g %g %g %g)\n",
1011                       result[0], result[1], result[2], result[3],
1012                       a[0], a[1], a[2], a[3],
1013                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
1014             }
1015          }
1016          break;
1017       case OPCODE_MAD:
1018          {
1019             GLfloat a[4], b[4], c[4], result[4];
1020             fetch_vector4(&inst->SrcReg[0], machine, a);
1021             fetch_vector4(&inst->SrcReg[1], machine, b);
1022             fetch_vector4(&inst->SrcReg[2], machine, c);
1023             result[0] = a[0] * b[0] + c[0];
1024             result[1] = a[1] * b[1] + c[1];
1025             result[2] = a[2] * b[2] + c[2];
1026             result[3] = a[3] * b[3] + c[3];
1027             store_vector4(inst, machine, result);
1028             if (DEBUG_PROG) {
1029                printf("MAD (%g %g %g %g) = (%g %g %g %g) * "
1030                       "(%g %g %g %g) + (%g %g %g %g)\n",
1031                       result[0], result[1], result[2], result[3],
1032                       a[0], a[1], a[2], a[3],
1033                       b[0], b[1], b[2], b[3], c[0], c[1], c[2], c[3]);
1034             }
1035          }
1036          break;
1037       case OPCODE_MAX:
1038          {
1039             GLfloat a[4], b[4], result[4];
1040             fetch_vector4(&inst->SrcReg[0], machine, a);
1041             fetch_vector4(&inst->SrcReg[1], machine, b);
1042             result[0] = MAX2(a[0], b[0]);
1043             result[1] = MAX2(a[1], b[1]);
1044             result[2] = MAX2(a[2], b[2]);
1045             result[3] = MAX2(a[3], b[3]);
1046             store_vector4(inst, machine, result);
1047             if (DEBUG_PROG) {
1048                printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
1049                       result[0], result[1], result[2], result[3],
1050                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1051             }
1052          }
1053          break;
1054       case OPCODE_MIN:
1055          {
1056             GLfloat a[4], b[4], result[4];
1057             fetch_vector4(&inst->SrcReg[0], machine, a);
1058             fetch_vector4(&inst->SrcReg[1], machine, b);
1059             result[0] = MIN2(a[0], b[0]);
1060             result[1] = MIN2(a[1], b[1]);
1061             result[2] = MIN2(a[2], b[2]);
1062             result[3] = MIN2(a[3], b[3]);
1063             store_vector4(inst, machine, result);
1064          }
1065          break;
1066       case OPCODE_MOV:
1067          {
1068             GLfloat result[4];
1069             fetch_vector4(&inst->SrcReg[0], machine, result);
1070             store_vector4(inst, machine, result);
1071             if (DEBUG_PROG) {
1072                printf("MOV (%g %g %g %g)\n",
1073                       result[0], result[1], result[2], result[3]);
1074             }
1075          }
1076          break;
1077       case OPCODE_MUL:
1078          {
1079             GLfloat a[4], b[4], result[4];
1080             fetch_vector4(&inst->SrcReg[0], machine, a);
1081             fetch_vector4(&inst->SrcReg[1], machine, b);
1082             result[0] = a[0] * b[0];
1083             result[1] = a[1] * b[1];
1084             result[2] = a[2] * b[2];
1085             result[3] = a[3] * b[3];
1086             store_vector4(inst, machine, result);
1087             if (DEBUG_PROG) {
1088                printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
1089                       result[0], result[1], result[2], result[3],
1090                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1091             }
1092          }
1093          break;
1094       case OPCODE_NOISE1:
1095          {
1096             GLfloat a[4], result[4];
1097             fetch_vector1(&inst->SrcReg[0], machine, a);
1098             result[0] =
1099                result[1] =
1100                result[2] =
1101                result[3] = _mesa_noise1(a[0]);
1102             store_vector4(inst, machine, result);
1103          }
1104          break;
1105       case OPCODE_NOISE2:
1106          {
1107             GLfloat a[4], result[4];
1108             fetch_vector4(&inst->SrcReg[0], machine, a);
1109             result[0] =
1110                result[1] =
1111                result[2] = result[3] = _mesa_noise2(a[0], a[1]);
1112             store_vector4(inst, machine, result);
1113          }
1114          break;
1115       case OPCODE_NOISE3:
1116          {
1117             GLfloat a[4], result[4];
1118             fetch_vector4(&inst->SrcReg[0], machine, a);
1119             result[0] =
1120                result[1] =
1121                result[2] =
1122                result[3] = _mesa_noise3(a[0], a[1], a[2]);
1123             store_vector4(inst, machine, result);
1124          }
1125          break;
1126       case OPCODE_NOISE4:
1127          {
1128             GLfloat a[4], result[4];
1129             fetch_vector4(&inst->SrcReg[0], machine, a);
1130             result[0] =
1131                result[1] =
1132                result[2] =
1133                result[3] = _mesa_noise4(a[0], a[1], a[2], a[3]);
1134             store_vector4(inst, machine, result);
1135          }
1136          break;
1137       case OPCODE_NOP:
1138          break;
1139       case OPCODE_PK2H:        /* pack two 16-bit floats in one 32-bit float */
1140          {
1141             GLfloat a[4];
1142             GLuint result[4];
1143             GLhalfNV hx, hy;
1144             fetch_vector4(&inst->SrcReg[0], machine, a);
1145             hx = _mesa_float_to_half(a[0]);
1146             hy = _mesa_float_to_half(a[1]);
1147             result[0] =
1148             result[1] =
1149             result[2] =
1150             result[3] = hx | (hy << 16);
1151             store_vector4ui(inst, machine, result);
1152          }
1153          break;
1154       case OPCODE_PK2US:       /* pack two GLushorts into one 32-bit float */
1155          {
1156             GLfloat a[4];
1157             GLuint result[4], usx, usy;
1158             fetch_vector4(&inst->SrcReg[0], machine, a);
1159             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1160             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1161             usx = F_TO_I(a[0] * 65535.0F);
1162             usy = F_TO_I(a[1] * 65535.0F);
1163             result[0] =
1164             result[1] =
1165             result[2] =
1166             result[3] = usx | (usy << 16);
1167             store_vector4ui(inst, machine, result);
1168          }
1169          break;
1170       case OPCODE_PK4B:        /* pack four GLbytes into one 32-bit float */
1171          {
1172             GLfloat a[4];
1173             GLuint result[4], ubx, uby, ubz, ubw;
1174             fetch_vector4(&inst->SrcReg[0], machine, a);
1175             a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
1176             a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
1177             a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
1178             a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
1179             ubx = F_TO_I(127.0F * a[0] + 128.0F);
1180             uby = F_TO_I(127.0F * a[1] + 128.0F);
1181             ubz = F_TO_I(127.0F * a[2] + 128.0F);
1182             ubw = F_TO_I(127.0F * a[3] + 128.0F);
1183             result[0] =
1184             result[1] =
1185             result[2] =
1186             result[3] = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1187             store_vector4ui(inst, machine, result);
1188          }
1189          break;
1190       case OPCODE_PK4UB:       /* pack four GLubytes into one 32-bit float */
1191          {
1192             GLfloat a[4];
1193             GLuint result[4], ubx, uby, ubz, ubw;
1194             fetch_vector4(&inst->SrcReg[0], machine, a);
1195             a[0] = CLAMP(a[0], 0.0F, 1.0F);
1196             a[1] = CLAMP(a[1], 0.0F, 1.0F);
1197             a[2] = CLAMP(a[2], 0.0F, 1.0F);
1198             a[3] = CLAMP(a[3], 0.0F, 1.0F);
1199             ubx = F_TO_I(255.0F * a[0]);
1200             uby = F_TO_I(255.0F * a[1]);
1201             ubz = F_TO_I(255.0F * a[2]);
1202             ubw = F_TO_I(255.0F * a[3]);
1203             result[0] =
1204             result[1] =
1205             result[2] =
1206             result[3] = ubx | (uby << 8) | (ubz << 16) | (ubw << 24);
1207             store_vector4ui(inst, machine, result);
1208          }
1209          break;
1210       case OPCODE_POW:
1211          {
1212             GLfloat a[4], b[4], result[4];
1213             fetch_vector1(&inst->SrcReg[0], machine, a);
1214             fetch_vector1(&inst->SrcReg[1], machine, b);
1215             result[0] = result[1] = result[2] = result[3]
1216                = (GLfloat) pow(a[0], b[0]);
1217             store_vector4(inst, machine, result);
1218          }
1219          break;
1220
1221       case OPCODE_RCP:
1222          {
1223             GLfloat a[4], result[4];
1224             fetch_vector1(&inst->SrcReg[0], machine, a);
1225             if (DEBUG_PROG) {
1226                if (a[0] == 0)
1227                   printf("RCP(0)\n");
1228                else if (IS_INF_OR_NAN(a[0]))
1229                   printf("RCP(inf)\n");
1230             }
1231             result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
1232             store_vector4(inst, machine, result);
1233          }
1234          break;
1235       case OPCODE_RET:         /* return from subroutine (conditional) */
1236          if (eval_condition(machine, inst)) {
1237             if (machine->StackDepth == 0) {
1238                return GL_TRUE;  /* Per GL_NV_vertex_program2 spec */
1239             }
1240             /* subtract one because of pc++ in the for loop */
1241             pc = machine->CallStack[--machine->StackDepth] - 1;
1242          }
1243          break;
1244       case OPCODE_RFL:         /* reflection vector */
1245          {
1246             GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
1247             fetch_vector4(&inst->SrcReg[0], machine, axis);
1248             fetch_vector4(&inst->SrcReg[1], machine, dir);
1249             tmpW = DOT3(axis, axis);
1250             tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
1251             result[0] = tmpX * axis[0] - dir[0];
1252             result[1] = tmpX * axis[1] - dir[1];
1253             result[2] = tmpX * axis[2] - dir[2];
1254             /* result[3] is never written! XXX enforce in parser! */
1255             store_vector4(inst, machine, result);
1256          }
1257          break;
1258       case OPCODE_RSQ:         /* 1 / sqrt() */
1259          {
1260             GLfloat a[4], result[4];
1261             fetch_vector1(&inst->SrcReg[0], machine, a);
1262             a[0] = FABSF(a[0]);
1263             result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
1264             store_vector4(inst, machine, result);
1265             if (DEBUG_PROG) {
1266                printf("RSQ %g = 1/sqrt(|%g|)\n", result[0], a[0]);
1267             }
1268          }
1269          break;
1270       case OPCODE_SCS:         /* sine and cos */
1271          {
1272             GLfloat a[4], result[4];
1273             fetch_vector1(&inst->SrcReg[0], machine, a);
1274             result[0] = (GLfloat) cos(a[0]);
1275             result[1] = (GLfloat) sin(a[0]);
1276             result[2] = 0.0;    /* undefined! */
1277             result[3] = 0.0;    /* undefined! */
1278             store_vector4(inst, machine, result);
1279          }
1280          break;
1281       case OPCODE_SEQ:         /* set on equal */
1282          {
1283             GLfloat a[4], b[4], result[4];
1284             fetch_vector4(&inst->SrcReg[0], machine, a);
1285             fetch_vector4(&inst->SrcReg[1], machine, b);
1286             result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
1287             result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
1288             result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
1289             result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
1290             store_vector4(inst, machine, result);
1291             if (DEBUG_PROG) {
1292                printf("SEQ (%g %g %g %g) = (%g %g %g %g) == (%g %g %g %g)\n",
1293                       result[0], result[1], result[2], result[3],
1294                       a[0], a[1], a[2], a[3],
1295                       b[0], b[1], b[2], b[3]);
1296             }
1297          }
1298          break;
1299       case OPCODE_SFL:         /* set false, operands ignored */
1300          {
1301             static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
1302             store_vector4(inst, machine, result);
1303          }
1304          break;
1305       case OPCODE_SGE:         /* set on greater or equal */
1306          {
1307             GLfloat a[4], b[4], result[4];
1308             fetch_vector4(&inst->SrcReg[0], machine, a);
1309             fetch_vector4(&inst->SrcReg[1], machine, b);
1310             result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
1311             result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
1312             result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
1313             result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
1314             store_vector4(inst, machine, result);
1315             if (DEBUG_PROG) {
1316                printf("SGE (%g %g %g %g) = (%g %g %g %g) >= (%g %g %g %g)\n",
1317                       result[0], result[1], result[2], result[3],
1318                       a[0], a[1], a[2], a[3],
1319                       b[0], b[1], b[2], b[3]);
1320             }
1321          }
1322          break;
1323       case OPCODE_SGT:         /* set on greater */
1324          {
1325             GLfloat a[4], b[4], result[4];
1326             fetch_vector4(&inst->SrcReg[0], machine, a);
1327             fetch_vector4(&inst->SrcReg[1], machine, b);
1328             result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
1329             result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
1330             result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
1331             result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
1332             store_vector4(inst, machine, result);
1333             if (DEBUG_PROG) {
1334                printf("SGT (%g %g %g %g) = (%g %g %g %g) > (%g %g %g %g)\n",
1335                       result[0], result[1], result[2], result[3],
1336                       a[0], a[1], a[2], a[3],
1337                       b[0], b[1], b[2], b[3]);
1338             }
1339          }
1340          break;
1341       case OPCODE_SIN:
1342          {
1343             GLfloat a[4], result[4];
1344             fetch_vector1(&inst->SrcReg[0], machine, a);
1345             result[0] = result[1] = result[2] = result[3]
1346                = (GLfloat) sin(a[0]);
1347             store_vector4(inst, machine, result);
1348          }
1349          break;
1350       case OPCODE_SLE:         /* set on less or equal */
1351          {
1352             GLfloat a[4], b[4], result[4];
1353             fetch_vector4(&inst->SrcReg[0], machine, a);
1354             fetch_vector4(&inst->SrcReg[1], machine, b);
1355             result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
1356             result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
1357             result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
1358             result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
1359             store_vector4(inst, machine, result);
1360             if (DEBUG_PROG) {
1361                printf("SLE (%g %g %g %g) = (%g %g %g %g) <= (%g %g %g %g)\n",
1362                       result[0], result[1], result[2], result[3],
1363                       a[0], a[1], a[2], a[3],
1364                       b[0], b[1], b[2], b[3]);
1365             }
1366          }
1367          break;
1368       case OPCODE_SLT:         /* set on less */
1369          {
1370             GLfloat a[4], b[4], result[4];
1371             fetch_vector4(&inst->SrcReg[0], machine, a);
1372             fetch_vector4(&inst->SrcReg[1], machine, b);
1373             result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
1374             result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
1375             result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
1376             result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
1377             store_vector4(inst, machine, result);
1378             if (DEBUG_PROG) {
1379                printf("SLT (%g %g %g %g) = (%g %g %g %g) < (%g %g %g %g)\n",
1380                       result[0], result[1], result[2], result[3],
1381                       a[0], a[1], a[2], a[3],
1382                       b[0], b[1], b[2], b[3]);
1383             }
1384          }
1385          break;
1386       case OPCODE_SNE:         /* set on not equal */
1387          {
1388             GLfloat a[4], b[4], result[4];
1389             fetch_vector4(&inst->SrcReg[0], machine, a);
1390             fetch_vector4(&inst->SrcReg[1], machine, b);
1391             result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
1392             result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
1393             result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
1394             result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
1395             store_vector4(inst, machine, result);
1396             if (DEBUG_PROG) {
1397                printf("SNE (%g %g %g %g) = (%g %g %g %g) != (%g %g %g %g)\n",
1398                       result[0], result[1], result[2], result[3],
1399                       a[0], a[1], a[2], a[3],
1400                       b[0], b[1], b[2], b[3]);
1401             }
1402          }
1403          break;
1404       case OPCODE_SSG:         /* set sign (-1, 0 or +1) */
1405          {
1406             GLfloat a[4], result[4];
1407             fetch_vector4(&inst->SrcReg[0], machine, a);
1408             result[0] = (GLfloat) ((a[0] > 0.0F) - (a[0] < 0.0F));
1409             result[1] = (GLfloat) ((a[1] > 0.0F) - (a[1] < 0.0F));
1410             result[2] = (GLfloat) ((a[2] > 0.0F) - (a[2] < 0.0F));
1411             result[3] = (GLfloat) ((a[3] > 0.0F) - (a[3] < 0.0F));
1412             store_vector4(inst, machine, result);
1413          }
1414          break;
1415       case OPCODE_STR:         /* set true, operands ignored */
1416          {
1417             static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
1418             store_vector4(inst, machine, result);
1419          }
1420          break;
1421       case OPCODE_SUB:
1422          {
1423             GLfloat a[4], b[4], result[4];
1424             fetch_vector4(&inst->SrcReg[0], machine, a);
1425             fetch_vector4(&inst->SrcReg[1], machine, b);
1426             result[0] = a[0] - b[0];
1427             result[1] = a[1] - b[1];
1428             result[2] = a[2] - b[2];
1429             result[3] = a[3] - b[3];
1430             store_vector4(inst, machine, result);
1431             if (DEBUG_PROG) {
1432                printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
1433                       result[0], result[1], result[2], result[3],
1434                       a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
1435             }
1436          }
1437          break;
1438       case OPCODE_SWZ:         /* extended swizzle */
1439          {
1440             const struct prog_src_register *source = &inst->SrcReg[0];
1441             const GLfloat *src = get_src_register_pointer(source, machine);
1442             GLfloat result[4];
1443             GLuint i;
1444             for (i = 0; i < 4; i++) {
1445                const GLuint swz = GET_SWZ(source->Swizzle, i);
1446                if (swz == SWIZZLE_ZERO)
1447                   result[i] = 0.0;
1448                else if (swz == SWIZZLE_ONE)
1449                   result[i] = 1.0;
1450                else {
1451                   ASSERT(swz >= 0);
1452                   ASSERT(swz <= 3);
1453                   result[i] = src[swz];
1454                }
1455                if (source->Negate & (1 << i))
1456                   result[i] = -result[i];
1457             }
1458             store_vector4(inst, machine, result);
1459          }
1460          break;
1461       case OPCODE_TEX:         /* Both ARB and NV frag prog */
1462          /* Simple texel lookup */
1463          {
1464             GLfloat texcoord[4], color[4];
1465             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1466
1467             /* For TEX, texcoord.Q should not be used and its value should not
1468              * matter (at most, we pass coord.xyz to texture3D() in GLSL).
1469              * Set Q=1 so that FetchTexelDeriv() doesn't get a garbage value
1470              * which is effectively what happens when the texcoord swizzle
1471              * is .xyzz
1472              */
1473             texcoord[3] = 1.0f;
1474
1475             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1476
1477             if (DEBUG_PROG) {
1478                printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g]\n",
1479                       color[0], color[1], color[2], color[3],
1480                       inst->TexSrcUnit,
1481                       texcoord[0], texcoord[1], texcoord[2], texcoord[3]);
1482             }
1483             store_vector4(inst, machine, color);
1484          }
1485          break;
1486       case OPCODE_TXB:         /* GL_ARB_fragment_program only */
1487          /* Texel lookup with LOD bias */
1488          {
1489             GLfloat texcoord[4], color[4], lodBias;
1490
1491             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1492
1493             /* texcoord[3] is the bias to add to lambda */
1494             lodBias = texcoord[3];
1495
1496             fetch_texel(ctx, machine, inst, texcoord, lodBias, color);
1497
1498             if (DEBUG_PROG) {
1499                printf("TXB (%g, %g, %g, %g) = texture[%d][%g %g %g %g]"
1500                       "  bias %g\n",
1501                       color[0], color[1], color[2], color[3],
1502                       inst->TexSrcUnit,
1503                       texcoord[0],
1504                       texcoord[1],
1505                       texcoord[2],
1506                       texcoord[3],
1507                       lodBias);
1508             }
1509
1510             store_vector4(inst, machine, color);
1511          }
1512          break;
1513       case OPCODE_TXD:         /* GL_NV_fragment_program only */
1514          /* Texture lookup w/ partial derivatives for LOD */
1515          {
1516             GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
1517             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1518             fetch_vector4(&inst->SrcReg[1], machine, dtdx);
1519             fetch_vector4(&inst->SrcReg[2], machine, dtdy);
1520             machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
1521                                      0.0, /* lodBias */
1522                                      inst->TexSrcUnit, color);
1523             store_vector4(inst, machine, color);
1524          }
1525          break;
1526       case OPCODE_TXL:
1527          /* Texel lookup with explicit LOD */
1528          {
1529             GLfloat texcoord[4], color[4], lod;
1530
1531             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1532
1533             /* texcoord[3] is the LOD */
1534             lod = texcoord[3];
1535
1536             machine->FetchTexelLod(ctx, texcoord, lod,
1537                                    machine->Samplers[inst->TexSrcUnit], color);
1538
1539             store_vector4(inst, machine, color);
1540          }
1541          break;
1542       case OPCODE_TXP:         /* GL_ARB_fragment_program only */
1543          /* Texture lookup w/ projective divide */
1544          {
1545             GLfloat texcoord[4], color[4];
1546
1547             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1548             /* Not so sure about this test - if texcoord[3] is
1549              * zero, we'd probably be fine except for an ASSERT in
1550              * IROUND_POS() which gets triggered by the inf values created.
1551              */
1552             if (texcoord[3] != 0.0) {
1553                texcoord[0] /= texcoord[3];
1554                texcoord[1] /= texcoord[3];
1555                texcoord[2] /= texcoord[3];
1556             }
1557
1558             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1559
1560             store_vector4(inst, machine, color);
1561          }
1562          break;
1563       case OPCODE_TXP_NV:      /* GL_NV_fragment_program only */
1564          /* Texture lookup w/ projective divide, as above, but do not
1565           * do the divide by w if sampling from a cube map.
1566           */
1567          {
1568             GLfloat texcoord[4], color[4];
1569
1570             fetch_vector4(&inst->SrcReg[0], machine, texcoord);
1571             if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
1572                 texcoord[3] != 0.0) {
1573                texcoord[0] /= texcoord[3];
1574                texcoord[1] /= texcoord[3];
1575                texcoord[2] /= texcoord[3];
1576             }
1577
1578             fetch_texel(ctx, machine, inst, texcoord, 0.0, color);
1579
1580             store_vector4(inst, machine, color);
1581          }
1582          break;
1583       case OPCODE_TRUNC:       /* truncate toward zero */
1584          {
1585             GLfloat a[4], result[4];
1586             fetch_vector4(&inst->SrcReg[0], machine, a);
1587             result[0] = (GLfloat) (GLint) a[0];
1588             result[1] = (GLfloat) (GLint) a[1];
1589             result[2] = (GLfloat) (GLint) a[2];
1590             result[3] = (GLfloat) (GLint) a[3];
1591             store_vector4(inst, machine, result);
1592          }
1593          break;
1594       case OPCODE_UP2H:        /* unpack two 16-bit floats */
1595          {
1596             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1597             GLfloat result[4];
1598             GLushort hx, hy;
1599             hx = raw & 0xffff;
1600             hy = raw >> 16;
1601             result[0] = result[2] = _mesa_half_to_float(hx);
1602             result[1] = result[3] = _mesa_half_to_float(hy);
1603             store_vector4(inst, machine, result);
1604          }
1605          break;
1606       case OPCODE_UP2US:       /* unpack two GLushorts */
1607          {
1608             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1609             GLfloat result[4];
1610             GLushort usx, usy;
1611             usx = raw & 0xffff;
1612             usy = raw >> 16;
1613             result[0] = result[2] = usx * (1.0f / 65535.0f);
1614             result[1] = result[3] = usy * (1.0f / 65535.0f);
1615             store_vector4(inst, machine, result);
1616          }
1617          break;
1618       case OPCODE_UP4B:        /* unpack four GLbytes */
1619          {
1620             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1621             GLfloat result[4];
1622             result[0] = (((raw >> 0) & 0xff) - 128) / 127.0F;
1623             result[1] = (((raw >> 8) & 0xff) - 128) / 127.0F;
1624             result[2] = (((raw >> 16) & 0xff) - 128) / 127.0F;
1625             result[3] = (((raw >> 24) & 0xff) - 128) / 127.0F;
1626             store_vector4(inst, machine, result);
1627          }
1628          break;
1629       case OPCODE_UP4UB:       /* unpack four GLubytes */
1630          {
1631             const GLuint raw = fetch_vector1ui(&inst->SrcReg[0], machine);
1632             GLfloat result[4];
1633             result[0] = ((raw >> 0) & 0xff) / 255.0F;
1634             result[1] = ((raw >> 8) & 0xff) / 255.0F;
1635             result[2] = ((raw >> 16) & 0xff) / 255.0F;
1636             result[3] = ((raw >> 24) & 0xff) / 255.0F;
1637             store_vector4(inst, machine, result);
1638          }
1639          break;
1640       case OPCODE_XPD:         /* cross product */
1641          {
1642             GLfloat a[4], b[4], result[4];
1643             fetch_vector4(&inst->SrcReg[0], machine, a);
1644             fetch_vector4(&inst->SrcReg[1], machine, b);
1645             result[0] = a[1] * b[2] - a[2] * b[1];
1646             result[1] = a[2] * b[0] - a[0] * b[2];
1647             result[2] = a[0] * b[1] - a[1] * b[0];
1648             result[3] = 1.0;
1649             store_vector4(inst, machine, result);
1650             if (DEBUG_PROG) {
1651                printf("XPD (%g %g %g %g) = (%g %g %g) X (%g %g %g)\n",
1652                       result[0], result[1], result[2], result[3],
1653                       a[0], a[1], a[2], b[0], b[1], b[2]);
1654             }
1655          }
1656          break;
1657       case OPCODE_X2D:         /* 2-D matrix transform */
1658          {
1659             GLfloat a[4], b[4], c[4], result[4];
1660             fetch_vector4(&inst->SrcReg[0], machine, a);
1661             fetch_vector4(&inst->SrcReg[1], machine, b);
1662             fetch_vector4(&inst->SrcReg[2], machine, c);
1663             result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
1664             result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
1665             result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
1666             result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
1667             store_vector4(inst, machine, result);
1668          }
1669          break;
1670       case OPCODE_END:
1671          return GL_TRUE;
1672       default:
1673          _mesa_problem(ctx, "Bad opcode %d in _mesa_execute_program",
1674                        inst->Opcode);
1675          return GL_TRUE;        /* return value doesn't matter */
1676       }
1677
1678       numExec++;
1679       if (numExec > maxExec) {
1680          static GLboolean reported = GL_FALSE;
1681          if (!reported) {
1682             _mesa_problem(ctx, "Infinite loop detected in fragment program");
1683             reported = GL_TRUE;
1684          }
1685          return GL_TRUE;
1686       }
1687
1688    } /* for pc */
1689
1690    return GL_TRUE;
1691 }