OSDN Git Service

R600: initial copy of r300 code
authorAlex Deucher <alexdeucher@gmail.com>
Wed, 8 Apr 2009 19:16:35 +0000 (15:16 -0400)
committerAlex Deucher <alexdeucher@gmail.com>
Wed, 8 Apr 2009 19:16:35 +0000 (15:16 -0400)
44 files changed:
configure.ac
src/mesa/drivers/dri/r600/Lindent [new file with mode: 0755]
src/mesa/drivers/dri/r600/Makefile [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_cmdbuf.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_cmdbuf.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_context.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_context.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_emit.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_emit.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_fragprog.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_fragprog.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_fragprog_emit.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_fragprog_swizzle.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_fragprog_swizzle.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_ioctl.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_ioctl.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_reg.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_render.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_shader.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_state.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_state.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_swtcl.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_swtcl.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_tex.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_tex.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_texstate.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_vertprog.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r600_vertprog.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r700_fragprog.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/r700_fragprog.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/r700_fragprog_emit.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/radeon_context.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/radeon_nqssadce.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/radeon_nqssadce.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/radeon_program.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/radeon_program.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/radeon_program_alu.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/radeon_program_alu.h [new file with mode: 0644]
src/mesa/drivers/dri/r600/radeon_program_pair.c [new file with mode: 0644]
src/mesa/drivers/dri/r600/radeon_program_pair.h [new file with mode: 0644]
src/mesa/drivers/dri/radeon/radeon_chipset.h
src/mesa/drivers/dri/radeon/radeon_common_context.c
src/mesa/drivers/dri/radeon/radeon_screen.c
src/mesa/drivers/dri/radeon/radeon_screen.h

index 4164d37..f81f349 100644 (file)
@@ -724,7 +724,7 @@ if test "$mesa_driver" = dri; then
             # because there is no x86-64 system where they could *ever*
             # be used.
             if test "x$DRI_DIRS" = "xyes"; then
-                DRI_DIRS="i915 i965 mach64 mga r128 r200 r300 radeon \
+                DRI_DIRS="i915 i965 mach64 mga r128 r200 r300 r600 radeon \
                     savage tdfx unichrome swrast"
             fi
             ;;
@@ -732,13 +732,13 @@ if test "$mesa_driver" = dri; then
             # Build only the drivers for cards that exist on PowerPC.
             # At some point MGA will be added, but not yet.
             if test "x$DRI_DIRS" = "xyes"; then
-                DRI_DIRS="mach64 r128 r200 r300 radeon tdfx swrast"
+                DRI_DIRS="mach64 r128 r200 r300 r600 radeon tdfx swrast"
             fi
             ;;
         sparc*)
             # Build only the drivers for cards that exist on sparc`
             if test "x$DRI_DIRS" = "xyes"; then
-                DRI_DIRS="mach64 r128 r200 r300 radeon ffb swrast"
+                DRI_DIRS="mach64 r128 r200 r300 r600 radeon ffb swrast"
             fi
             ;;
         esac
@@ -757,7 +757,7 @@ if test "$mesa_driver" = dri; then
         # ffb and gamma are missing because they have not been converted
         # to use the new interface.
         if test "x$DRI_DIRS" = "xyes"; then
-            DRI_DIRS="i810 i915 i965 mach64 mga r128 r200 r300 radeon tdfx \
+            DRI_DIRS="i810 i915 i965 mach64 mga r128 r200 r300 r600 radeon tdfx \
                 unichrome savage sis swrast"
         fi
         ;;
@@ -772,7 +772,7 @@ if test "$mesa_driver" = dri; then
 
     # default drivers
     if test "x$DRI_DIRS" = "xyes"; then
-        DRI_DIRS="i810 i915 i965 mach64 mga r128 r200 r300 radeon s3v \
+        DRI_DIRS="i810 i915 i965 mach64 mga r128 r200 r300 r600 radeon s3v \
             savage sis tdfx trident unichrome ffb swrast"
     fi
 
diff --git a/src/mesa/drivers/dri/r600/Lindent b/src/mesa/drivers/dri/r600/Lindent
new file mode 100755 (executable)
index 0000000..7d8d889
--- /dev/null
@@ -0,0 +1,2 @@
+#!/bin/sh
+indent -npro -kr -i8 -ts8 -sob -l80 -ss -ncs "$@"
diff --git a/src/mesa/drivers/dri/r600/Makefile b/src/mesa/drivers/dri/r600/Makefile
new file mode 100644 (file)
index 0000000..e1fc406
--- /dev/null
@@ -0,0 +1,118 @@
+# src/mesa/drivers/dri/r300/Makefile
+
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+CFLAGS += $(RADEON_CFLAGS)
+
+LIBNAME = r600_dri.so
+
+MINIGLX_SOURCES = server/radeon_dri.c
+
+ifeq ($(USING_EGL), 1)
+EGL_SOURCES = server/radeon_egl.c
+endif
+
+COMMON_SOURCES = \
+       ../../common/driverfuncs.c \
+       ../common/mm.c \
+       ../common/utils.c \
+       ../common/texmem.c \
+       ../common/vblank.c \
+       ../common/xmlconfig.c \
+       ../common/dri_util.c
+
+RADEON_COMMON_SOURCES = \
+       radeon_texture.c \
+       radeon_common_context.c \
+       radeon_common.c \
+       radeon_dma.c \
+       radeon_lock.c \
+       radeon_bo_legacy.c \
+       radeon_cs_legacy.c \
+       radeon_mipmap_tree.c \
+       radeon_span.c \
+       radeon_fbo.c
+
+DRIVER_SOURCES = \
+                radeon_screen.c \
+                r600_context.c \
+                r600_ioctl.c \
+                r600_cmdbuf.c \
+                r600_state.c \
+                r600_render.c \
+                r600_tex.c \
+                r600_texstate.c \
+                radeon_program.c \
+                radeon_program_alu.c \
+                radeon_program_pair.c \
+                radeon_nqssadce.c \
+                r600_vertprog.c \
+                r600_fragprog.c \
+                r600_fragprog_swizzle.c \
+                r600_fragprog_emit.c \
+                r700_fragprog.c \
+                r700_fragprog_emit.c \
+                r600_shader.c \
+                r600_emit.c \
+                r600_swtcl.c \
+                $(RADEON_COMMON_SOURCES) \
+                $(EGL_SOURCES)
+
+C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
+
+DRIVER_DEFINES = -DCOMPILE_R600 -DR200_MERGED=0 \
+       -DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R600 \
+#      -DRADEON_BO_TRACK \
+       -Wall
+
+SYMLINKS = \
+       server/radeon_dri.c \
+       server/radeon_dri.h \
+       server/radeon.h \
+       server/radeon_macros.h \
+       server/radeon_reg.h \
+       server/radeon_egl.c
+
+COMMON_SYMLINKS = \
+       radeon_chipset.h \
+       radeon_screen.c \
+       radeon_screen.h \
+       radeon_span.h \
+       radeon_span.c \
+       radeon_bo_legacy.c \
+       radeon_cs_legacy.c \
+       radeon_bo_legacy.h \
+       radeon_cs_legacy.h \
+       radeon_bocs_wrapper.h \
+       radeon_lock.c \
+       radeon_lock.h \
+       radeon_common.c \
+       radeon_common.h \
+       radeon_common_context.c \
+       radeon_common_context.h \
+       radeon_cmdbuf.h \
+       radeon_dma.c \
+       radeon_dma.h \
+       radeon_mipmap_tree.c \
+       radeon_mipmap_tree.h \
+       radeon_texture.c \
+       radeon_texture.h \
+       radeon_fbo.c
+
+DRI_LIB_DEPS += $(RADEON_LDFLAGS)
+
+##### TARGETS #####
+
+include ../Makefile.template
+
+server:
+       mkdir -p server
+
+$(SYMLINKS): server
+       @[ -e $@ ] || ln -sf ../../radeon/$@ server/
+
+$(COMMON_SYMLINKS):
+       @[ -e $@ ] || ln -sf ../radeon/$@ ./
+
+symlinks: $(SYMLINKS) $(COMMON_SYMLINKS)
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.c b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
new file mode 100644 (file)
index 0000000..1ae6dc8
--- /dev/null
@@ -0,0 +1,669 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "main/glheader.h"
+#include "main/state.h"
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "main/simple_list.h"
+#include "swrast/swrast.h"
+
+#include "drm.h"
+#include "radeon_drm.h"
+
+#include "r600_context.h"
+#include "r600_ioctl.h"
+#include "radeon_reg.h"
+#include "r600_reg.h"
+#include "r600_cmdbuf.h"
+#include "r600_emit.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_mipmap_tree.h"
+#include "r600_state.h"
+#include "radeon_reg.h"
+
+#define R300_VAP_PVS_UPLOAD_ADDRESS 0x2200
+#   define RADEON_ONE_REG_WR        (1 << 15)
+
+/** # of dwords reserved for additional instructions that may need to be written
+ * during flushing.
+ */
+#define SPACE_FOR_FLUSHING     4
+
+static unsigned packet0_count(r300ContextPtr r300, uint32_t *pkt)
+{
+    if (r300->radeon.radeonScreen->kernel_mm) {
+        return ((((*pkt) >> 16) & 0x3FFF) + 1);
+    } else {
+        drm_r300_cmd_header_t *t = (drm_r300_cmd_header_t*)pkt;
+        return t->packet0.count;
+    }
+    return 0;
+}
+
+#define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
+#define r500fp_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->r500fp.count)
+
+void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       BATCH_LOCALS(&r300->radeon);
+       drm_r300_cmd_header_t cmd;
+       uint32_t addr, ndw, i;
+       
+       if (!r300->radeon.radeonScreen->kernel_mm) {
+               uint32_t dwords;
+               dwords = (*atom->check) (ctx, atom);
+               BEGIN_BATCH_NO_AUTOSTATE(dwords);
+               OUT_BATCH_TABLE(atom->cmd, dwords);
+               END_BATCH();
+               return;
+       }
+       
+       cmd.u = atom->cmd[0];
+       addr = (cmd.vpu.adrhi << 8) | cmd.vpu.adrlo;
+       ndw = cmd.vpu.count * 4;
+       if (ndw) {
+
+               if (r300->vap_flush_needed) {
+                       BEGIN_BATCH_NO_AUTOSTATE(15 + ndw);
+
+                       /* flush processing vertices */
+                       OUT_BATCH_REGVAL(R300_SC_SCREENDOOR, 0);
+                       OUT_BATCH_REGVAL(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+                       OUT_BATCH_REGVAL(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
+                       OUT_BATCH_REGVAL(R300_SC_SCREENDOOR, 0xffffff);
+                       OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+                       r300->vap_flush_needed = GL_FALSE;
+               } else {
+                       BEGIN_BATCH_NO_AUTOSTATE(5 + ndw);
+               }
+               OUT_BATCH_REGVAL(R300_VAP_PVS_UPLOAD_ADDRESS, addr);
+               OUT_BATCH(CP_PACKET0(R300_VAP_PVS_UPLOAD_DATA, ndw-1) | RADEON_ONE_REG_WR);
+               for (i = 0; i < ndw; i++) {
+                       OUT_BATCH(atom->cmd[i+1]);
+               }
+               OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+               END_BATCH();
+       }
+}
+
+void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       BATCH_LOCALS(&r300->radeon);
+       drm_r300_cmd_header_t cmd;
+       uint32_t addr, ndw, i, sz;
+       int type, clamp, stride;
+
+       if (!r300->radeon.radeonScreen->kernel_mm) {
+               uint32_t dwords;
+               dwords = (*atom->check) (ctx, atom);
+               BEGIN_BATCH_NO_AUTOSTATE(dwords);
+               OUT_BATCH_TABLE(atom->cmd, dwords);
+               END_BATCH();
+               return;
+       }
+
+       cmd.u = atom->cmd[0];
+       sz = cmd.r500fp.count;
+       addr = ((cmd.r500fp.adrhi_flags & 1) << 8) | cmd.r500fp.adrlo;
+       type = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_TYPE);
+       clamp = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_CLAMP);
+
+       addr |= (type << 16);
+       addr |= (clamp << 17);
+
+       stride = type ? 4 : 6;
+
+       ndw = sz * stride;
+       if (ndw) {
+
+               BEGIN_BATCH_NO_AUTOSTATE(3 + ndw);
+               OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_INDEX, 0));
+               OUT_BATCH(addr);
+               OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_DATA, ndw-1) | RADEON_ONE_REG_WR);
+               for (i = 0; i < ndw; i++) {
+                       OUT_BATCH(atom->cmd[i+1]);
+               }
+               END_BATCH();
+       }
+}
+
+static void emit_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       BATCH_LOCALS(&r300->radeon);
+       int numtmus = packet0_count(r300, r300->hw.tex.offset.cmd);
+       int notexture = 0;
+
+       if (numtmus) {
+               int i;
+
+               for(i = 0; i < numtmus; ++i) {
+                   radeonTexObj *t = r300->hw.textures[i];
+               
+                   if (!t)
+                       notexture = 1;
+               }
+
+               if (r300->radeon.radeonScreen->kernel_mm && notexture) {
+                       return;
+               }
+               BEGIN_BATCH_NO_AUTOSTATE(4 * numtmus);
+               for(i = 0; i < numtmus; ++i) {
+                   radeonTexObj *t = r300->hw.textures[i];
+                   OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+                   if (t && !t->image_override) {
+                           OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+                                           RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+                   } else if (!t) {
+                           OUT_BATCH(r300->radeon.radeonScreen->texOffset[0]);
+                   } else { /* override cases */
+                           if (t->bo) {
+                                   OUT_BATCH_RELOC(t->tile_bits, t->bo, 0,
+                                                   RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+                           } else if (!r300->radeon.radeonScreen->kernel_mm) {
+                                   OUT_BATCH(t->override_offset);
+                           }
+                           else
+                               OUT_BATCH(r300->radeon.radeonScreen->texOffset[0]);
+                   }
+               }
+               END_BATCH();
+       }
+}
+
+static void emit_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       BATCH_LOCALS(&r300->radeon);
+       struct radeon_renderbuffer *rrb;
+       uint32_t cbpitch;
+       uint32_t offset = r300->radeon.state.color.draw_offset;
+
+       rrb = radeon_get_colorbuffer(&r300->radeon);
+       if (!rrb || !rrb->bo) {
+               fprintf(stderr, "no rrb\n");
+               return;
+       }
+
+       cbpitch = (rrb->pitch / rrb->cpp);
+       if (rrb->cpp == 4)
+               cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+       else
+               cbpitch |= R300_COLOR_FORMAT_RGB565;
+
+       if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+               cbpitch |= R300_COLOR_TILE_ENABLE;
+
+       BEGIN_BATCH_NO_AUTOSTATE(8);
+       OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
+       OUT_BATCH_RELOC(offset, rrb->bo, offset, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+       OUT_BATCH_REGSEQ(R300_RB3D_COLORPITCH0, 1);
+       OUT_BATCH_RELOC(cbpitch, rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+       END_BATCH();
+    if (r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+            BEGIN_BATCH_NO_AUTOSTATE(3);
+            OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+            OUT_BATCH(0);
+            OUT_BATCH((rrb->width << R300_SCISSORS_X_SHIFT) |
+                    (rrb->height << R300_SCISSORS_Y_SHIFT));
+            END_BATCH();
+        } else {
+            BEGIN_BATCH_NO_AUTOSTATE(3);
+            OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+            OUT_BATCH((R300_SCISSORS_OFFSET << R300_SCISSORS_X_SHIFT) |
+                    (R300_SCISSORS_OFFSET << R300_SCISSORS_Y_SHIFT));
+            OUT_BATCH(((rrb->width + R300_SCISSORS_OFFSET) << R300_SCISSORS_X_SHIFT) |
+                    ((rrb->height + R300_SCISSORS_OFFSET) << R300_SCISSORS_Y_SHIFT));
+            END_BATCH();
+        }
+    }
+}
+
+static void emit_zb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       BATCH_LOCALS(&r300->radeon);
+       struct radeon_renderbuffer *rrb;
+       uint32_t zbpitch;
+
+       rrb = radeon_get_depthbuffer(&r300->radeon);
+       if (!rrb)
+               return;
+
+       zbpitch = (rrb->pitch / rrb->cpp);
+       if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
+               zbpitch |= R300_DEPTHMACROTILE_ENABLE;
+       }
+       if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
+               zbpitch |= R300_DEPTHMICROTILE_TILED;
+       }
+       
+       BEGIN_BATCH_NO_AUTOSTATE(6);
+       OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
+       OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+       OUT_BATCH_REGVAL(R300_ZB_DEPTHPITCH, zbpitch);
+       END_BATCH();
+}
+
+static void emit_zstencil_format(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       BATCH_LOCALS(&r300->radeon);
+       struct radeon_renderbuffer *rrb;
+       uint32_t format = 0;
+
+       rrb = radeon_get_depthbuffer(&r300->radeon);
+       if (!rrb)
+         format = 0;
+       else {
+         if (rrb->cpp == 2)
+           format = R300_DEPTHFORMAT_16BIT_INT_Z;
+         else if (rrb->cpp == 4)
+           format = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
+       }
+
+       OUT_BATCH(atom->cmd[0]);
+       atom->cmd[1] &= ~0xf;
+       atom->cmd[1] |= format;
+       OUT_BATCH(atom->cmd[1]);
+       OUT_BATCH(atom->cmd[2]);
+       OUT_BATCH(atom->cmd[3]);
+       OUT_BATCH(atom->cmd[4]);
+}
+
+static int check_always(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+       return atom->cmd_size;
+}
+
+static int check_variable(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       int cnt;
+       if (atom->cmd[0] == CP_PACKET2) {
+               return 0;
+       }
+       cnt = packet0_count(r300, atom->cmd);
+       return cnt ? cnt + 1 : 0;
+}
+
+int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+       int cnt;
+
+       cnt = vpu_count(atom->cmd);
+       return cnt ? (cnt * 4) + 1 : 0;
+}
+
+int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+       int cnt;
+
+       cnt = r500fp_count(atom->cmd);
+       return cnt ? (cnt * 6) + 1 : 0;
+}
+
+int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+       int cnt;
+
+       cnt = r500fp_count(atom->cmd);
+       return cnt ? (cnt * 4) + 1 : 0;
+}
+
+#define ALLOC_STATE( ATOM, CHK, SZ, IDX )                              \
+   do {                                                                        \
+      r300->hw.ATOM.cmd_size = (SZ);                                   \
+      r300->hw.ATOM.cmd = (uint32_t*)CALLOC((SZ) * sizeof(uint32_t));  \
+      r300->hw.ATOM.name = #ATOM;                                      \
+      r300->hw.ATOM.idx = (IDX);                                       \
+      r300->hw.ATOM.check = check_##CHK;                               \
+      r300->hw.ATOM.dirty = GL_FALSE;                                  \
+      r300->radeon.hw.max_state_size += (SZ);                                  \
+      insert_at_tail(&r300->radeon.hw.atomlist, &r300->hw.ATOM);               \
+   } while (0)
+/**
+ * Allocate memory for the command buffer and initialize the state atom
+ * list. Note that the initial hardware state is set by r300InitState().
+ */
+void r300InitCmdBuf(r300ContextPtr r300)
+{
+       int mtu;
+       int has_tcl = 1;
+       int is_r500 = 0;
+       int i;
+
+       if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+               has_tcl = 0;
+
+       if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+               is_r500 = 1;
+
+       r300->radeon.hw.max_state_size = 2 + 2; /* reserve extra space for WAIT_IDLE and tex cache flush */
+
+       mtu = r300->radeon.glCtx->Const.MaxTextureUnits;
+       if (RADEON_DEBUG & DEBUG_TEXTURE) {
+               fprintf(stderr, "Using %d maximum texture units..\n", mtu);
+       }
+
+       /* Setup the atom linked list */
+       make_empty_list(&r300->radeon.hw.atomlist);
+       r300->radeon.hw.atomlist.name = "atom-list";
+
+       /* Initialize state atoms */
+       ALLOC_STATE(vpt, always, R300_VPT_CMDSIZE, 0);
+       r300->hw.vpt.cmd[R300_VPT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_SE_VPORT_XSCALE, 6);
+       ALLOC_STATE(vap_cntl, always, R300_VAP_CNTL_SIZE, 0);
+       r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PVS_STATE_FLUSH_REG, 1);
+       r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH_1] = 0;
+       r300->hw.vap_cntl.cmd[R300_VAP_CNTL_CMD] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CNTL, 1);
+       if (is_r500) {
+           ALLOC_STATE(vap_index_offset, always, 2, 0);
+           r300->hw.vap_index_offset.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_VAP_INDEX_OFFSET, 1);
+           r300->hw.vap_index_offset.cmd[1] = 0;
+       }
+       ALLOC_STATE(vte, always, 3, 0);
+       r300->hw.vte.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SE_VTE_CNTL, 2);
+       ALLOC_STATE(vap_vf_max_vtx_indx, always, 3, 0);
+       r300->hw.vap_vf_max_vtx_indx.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_VF_MAX_VTX_INDX, 2);
+       ALLOC_STATE(vap_cntl_status, always, 2, 0);
+       r300->hw.vap_cntl_status.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CNTL_STATUS, 1);
+       ALLOC_STATE(vir[0], variable, R300_VIR_CMDSIZE, 0);
+       r300->hw.vir[0].cmd[R300_VIR_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PROG_STREAM_CNTL_0, 1);
+       ALLOC_STATE(vir[1], variable, R300_VIR_CMDSIZE, 1);
+       r300->hw.vir[1].cmd[R300_VIR_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PROG_STREAM_CNTL_EXT_0, 1);
+       ALLOC_STATE(vic, always, R300_VIC_CMDSIZE, 0);
+       r300->hw.vic.cmd[R300_VIC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_VTX_STATE_CNTL, 2);
+       ALLOC_STATE(vap_psc_sgn_norm_cntl, always, 2, 0);
+       r300->hw.vap_psc_sgn_norm_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PSC_SGN_NORM_CNTL, SGN_NORM_ZERO_CLAMP_MINUS_ONE);
+
+       if (has_tcl) {
+               ALLOC_STATE(vap_clip_cntl, always, 2, 0);
+               r300->hw.vap_clip_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CLIP_CNTL, 1);
+               ALLOC_STATE(vap_clip, always, 5, 0);
+               r300->hw.vap_clip.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_GB_VERT_CLIP_ADJ, 4);
+               ALLOC_STATE(vap_pvs_vtx_timeout_reg, always, 2, 0);
+               r300->hw.vap_pvs_vtx_timeout_reg.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, VAP_PVS_VTX_TIMEOUT_REG, 1);
+       }
+
+       ALLOC_STATE(vof, always, R300_VOF_CMDSIZE, 0);
+       r300->hw.vof.cmd[R300_VOF_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_VAP_OUTPUT_VTX_FMT_0, 2);
+
+       if (has_tcl) {
+               ALLOC_STATE(pvs, always, R300_PVS_CMDSIZE, 0);
+               r300->hw.pvs.cmd[R300_PVS_CMD_0] =
+                   cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PVS_CODE_CNTL_0, 3);
+       }
+
+       ALLOC_STATE(gb_enable, always, 2, 0);
+       r300->hw.gb_enable.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_ENABLE, 1);
+       ALLOC_STATE(gb_misc, always, R300_GB_MISC_CMDSIZE, 0);
+       r300->hw.gb_misc.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_MSPOS0, 5);
+       ALLOC_STATE(txe, always, R300_TXE_CMDSIZE, 0);
+       r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_ENABLE, 1);
+       ALLOC_STATE(ga_point_s0, always, 5, 0);
+       r300->hw.ga_point_s0.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_S0, 4);
+       ALLOC_STATE(ga_triangle_stipple, always, 2, 0);
+       r300->hw.ga_triangle_stipple.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_TRIANGLE_STIPPLE, 1);
+       ALLOC_STATE(ps, always, R300_PS_CMDSIZE, 0);
+       r300->hw.ps.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_SIZE, 1);
+       ALLOC_STATE(ga_point_minmax, always, 4, 0);
+       r300->hw.ga_point_minmax.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_MINMAX, 3);
+       ALLOC_STATE(lcntl, always, 2, 0);
+       r300->hw.lcntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_CNTL, 1);
+       ALLOC_STATE(ga_line_stipple, always, 4, 0);
+       r300->hw.ga_line_stipple.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_STIPPLE_VALUE, 3);
+       ALLOC_STATE(shade, always, 5, 0);
+       r300->hw.shade.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_ENHANCE, 4);
+       ALLOC_STATE(polygon_mode, always, 4, 0);
+       r300->hw.polygon_mode.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POLY_MODE, 3);
+       ALLOC_STATE(fogp, always, 3, 0);
+       r300->hw.fogp.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_FOG_SCALE, 2);
+       ALLOC_STATE(zbias_cntl, always, 2, 0);
+       r300->hw.zbias_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_TEX_WRAP, 1);
+       ALLOC_STATE(zbs, always, R300_ZBS_CMDSIZE, 0);
+       r300->hw.zbs.cmd[R300_ZBS_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
+       ALLOC_STATE(occlusion_cntl, always, 2, 0);
+       r300->hw.occlusion_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_POLY_OFFSET_ENABLE, 1);
+       ALLOC_STATE(cul, always, R300_CUL_CMDSIZE, 0);
+       r300->hw.cul.cmd[R300_CUL_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_CULL_MODE, 1);
+       ALLOC_STATE(su_depth_scale, always, 3, 0);
+       r300->hw.su_depth_scale.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_DEPTH_SCALE, 2);
+       ALLOC_STATE(rc, always, R300_RC_CMDSIZE, 0);
+       r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_COUNT, 2);
+       if (is_r500) {
+               ALLOC_STATE(ri, always, R500_RI_CMDSIZE, 0);
+               r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_IP_0, 16);
+               for (i = 0; i < 8; i++) {
+                       r300->hw.ri.cmd[R300_RI_CMD_0 + i +1] =
+                         (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
+                          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
+                          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
+                          (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
+               }
+               ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
+               r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, 1);
+       } else {
+               ALLOC_STATE(ri, always, R300_RI_CMDSIZE, 0);
+               r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_IP_0, 8);
+               ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
+               r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, 1);
+       }
+       ALLOC_STATE(sc_hyperz, always, 3, 0);
+       r300->hw.sc_hyperz.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_HYPERZ, 2);
+       ALLOC_STATE(sc_screendoor, always, 2, 0);
+       r300->hw.sc_screendoor.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_SCREENDOOR, 1);
+       ALLOC_STATE(us_out_fmt, always, 6, 0);
+       r300->hw.us_out_fmt.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_OUT_FMT, 5);
+
+       if (is_r500) {
+               ALLOC_STATE(fp, always, R500_FP_CMDSIZE, 0);
+               r300->hw.fp.cmd[R500_FP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_US_CONFIG, 2);
+               r300->hw.fp.cmd[R500_FP_CNTL] = R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO;
+               r300->hw.fp.cmd[R500_FP_CMD_1] = cmdpacket0(r300->radeon.radeonScreen, R500_US_CODE_ADDR, 3);
+               r300->hw.fp.cmd[R500_FP_CMD_2] = cmdpacket0(r300->radeon.radeonScreen, R500_US_FC_CTRL, 1);
+               r300->hw.fp.cmd[R500_FP_FC_CNTL] = 0; /* FIXME when we add flow control */
+
+               ALLOC_STATE(r500fp, r500fp, R500_FPI_CMDSIZE, 0);
+               r300->hw.r500fp.cmd[R300_FPI_CMD_0] =
+                       cmdr500fp(r300->radeon.radeonScreen, 0, 0, 0, 0);
+               r300->hw.r500fp.emit = emit_r500fp;
+               ALLOC_STATE(r500fp_const, r500fp_const, R500_FPP_CMDSIZE, 0);
+               r300->hw.r500fp_const.cmd[R300_FPI_CMD_0] =
+                       cmdr500fp(r300->radeon.radeonScreen, 0, 0, 1, 0);
+               r300->hw.r500fp_const.emit = emit_r500fp;
+       } else {
+               ALLOC_STATE(fp, always, R300_FP_CMDSIZE, 0);
+               r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_CONFIG, 3);
+               r300->hw.fp.cmd[R300_FP_CMD_1] = cmdpacket0(r300->radeon.radeonScreen, R300_US_CODE_ADDR_0, 4);
+
+               ALLOC_STATE(fpt, variable, R300_FPT_CMDSIZE, 0);
+               r300->hw.fpt.cmd[R300_FPT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_TEX_INST_0, 0);
+
+               ALLOC_STATE(fpi[0], variable, R300_FPI_CMDSIZE, 0);
+               r300->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_RGB_INST_0, 1);
+               ALLOC_STATE(fpi[1], variable, R300_FPI_CMDSIZE, 1);
+               r300->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_RGB_ADDR_0, 1);
+               ALLOC_STATE(fpi[2], variable, R300_FPI_CMDSIZE, 2);
+               r300->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_ALPHA_INST_0, 1);
+               ALLOC_STATE(fpi[3], variable, R300_FPI_CMDSIZE, 3);
+               r300->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_ALPHA_ADDR_0, 1);
+               ALLOC_STATE(fpp, variable, R300_FPP_CMDSIZE, 0);
+               r300->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_PFS_PARAM_0_X, 0);
+       }
+       ALLOC_STATE(fogs, always, R300_FOGS_CMDSIZE, 0);
+       r300->hw.fogs.cmd[R300_FOGS_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_FOG_BLEND, 1);
+       ALLOC_STATE(fogc, always, R300_FOGC_CMDSIZE, 0);
+       r300->hw.fogc.cmd[R300_FOGC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_FOG_COLOR_R, 3);
+       ALLOC_STATE(at, always, R300_AT_CMDSIZE, 0);
+       r300->hw.at.cmd[R300_AT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_ALPHA_FUNC, 2);
+       ALLOC_STATE(fg_depth_src, always, 2, 0);
+       r300->hw.fg_depth_src.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_DEPTH_SRC, 1);
+       ALLOC_STATE(rb3d_cctl, always, 2, 0);
+       r300->hw.rb3d_cctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_CCTL, 1);
+       ALLOC_STATE(bld, always, R300_BLD_CMDSIZE, 0);
+       r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_CBLEND, 2);
+       ALLOC_STATE(cmk, always, R300_CMK_CMDSIZE, 0);
+       r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, RB3D_COLOR_CHANNEL_MASK, 1);
+       if (is_r500) {
+               ALLOC_STATE(blend_color, always, 3, 0);
+               r300->hw.blend_color.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_CONSTANT_COLOR_AR, 2);
+       } else {
+               ALLOC_STATE(blend_color, always, 2, 0);
+               r300->hw.blend_color.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_BLEND_COLOR, 1);
+       }
+       ALLOC_STATE(rop, always, 2, 0);
+       r300->hw.rop.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_ROPCNTL, 1);
+       ALLOC_STATE(cb, always, R300_CB_CMDSIZE, 0);
+       r300->hw.cb.emit = &emit_cb_offset;
+       ALLOC_STATE(rb3d_dither_ctl, always, 10, 0);
+       r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_DITHER_CTL, 9);
+       ALLOC_STATE(rb3d_aaresolve_ctl, always, 2, 0);
+       r300->hw.rb3d_aaresolve_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_AARESOLVE_CTL, 1);
+       ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, always, 3, 0);
+       r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 2);
+       ALLOC_STATE(zs, always, R300_ZS_CMDSIZE, 0);
+       r300->hw.zs.cmd[R300_ZS_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_ZB_CNTL, 3);
+
+       ALLOC_STATE(zstencil_format, always, 5, 0);
+       r300->hw.zstencil_format.cmd[0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_ZB_FORMAT, 4);
+       r300->hw.zstencil_format.emit = emit_zstencil_format;
+
+       ALLOC_STATE(zb, always, R300_ZB_CMDSIZE, 0);
+       r300->hw.zb.emit = emit_zb_offset;
+       ALLOC_STATE(zb_depthclearvalue, always, 2, 0);
+       r300->hw.zb_depthclearvalue.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_DEPTHCLEARVALUE, 1);
+       ALLOC_STATE(unk4F30, always, 3, 0);
+       r300->hw.unk4F30.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, 0x4F30, 2);
+       ALLOC_STATE(zb_hiz_offset, always, 2, 0);
+       r300->hw.zb_hiz_offset.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_HIZ_OFFSET, 1);
+       ALLOC_STATE(zb_hiz_pitch, always, 2, 0);
+       r300->hw.zb_hiz_pitch.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_HIZ_PITCH, 1);
+
+       /* VPU only on TCL */
+       if (has_tcl) {
+               int i;
+               ALLOC_STATE(vpi, vpu, R300_VPI_CMDSIZE, 0);
+               r300->hw.vpi.cmd[0] =
+                   cmdvpu(r300->radeon.radeonScreen, R300_PVS_CODE_START, 0);
+               r300->hw.vpi.emit = emit_vpu;
+
+               if (is_r500) {
+                   ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
+                   r300->hw.vpp.cmd[0] =
+                       cmdvpu(r300->radeon.radeonScreen, R500_PVS_CONST_START, 0);
+                   r300->hw.vpp.emit = emit_vpu;
+
+                   ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
+                   r300->hw.vps.cmd[0] =
+                       cmdvpu(r300->radeon.radeonScreen, R500_POINT_VPORT_SCALE_OFFSET, 1);
+                   r300->hw.vps.emit = emit_vpu;
+
+                       for (i = 0; i < 6; i++) {
+                         ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
+                         r300->hw.vpucp[i].cmd[0] =
+                                 cmdvpu(r300->radeon.radeonScreen,
+                           R500_PVS_UCP_START + i, 1);
+                               r300->hw.vpucp[i].emit = emit_vpu;
+                       }
+               } else {
+                   ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
+                   r300->hw.vpp.cmd[0] =
+                       cmdvpu(r300->radeon.radeonScreen, R300_PVS_CONST_START, 0);
+                   r300->hw.vpp.emit = emit_vpu;
+
+                   ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
+                   r300->hw.vps.cmd[0] =
+                       cmdvpu(r300->radeon.radeonScreen, R300_POINT_VPORT_SCALE_OFFSET, 1);
+                   r300->hw.vps.emit = emit_vpu;
+
+                       for (i = 0; i < 6; i++) {
+                               ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
+                               r300->hw.vpucp[i].cmd[0] =
+                                       cmdvpu(r300->radeon.radeonScreen,
+                                              R300_PVS_UCP_START + i, 1);
+                               r300->hw.vpucp[i].emit = emit_vpu;
+                       }
+               }
+       }
+
+       /* Textures */
+       ALLOC_STATE(tex.filter, variable, mtu + 1, 0);
+       r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, 0);
+
+       ALLOC_STATE(tex.filter_1, variable, mtu + 1, 0);
+       r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER1_0, 0);
+
+       ALLOC_STATE(tex.size, variable, mtu + 1, 0);
+       r300->hw.tex.size.cmd[R300_TEX_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_SIZE_0, 0);
+
+       ALLOC_STATE(tex.format, variable, mtu + 1, 0);
+       r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT_0, 0);
+
+       ALLOC_STATE(tex.pitch, variable, mtu + 1, 0);
+       r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT2_0, 0);
+
+       ALLOC_STATE(tex.offset, variable, 1, 0);
+       r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_OFFSET_0, 0);
+       r300->hw.tex.offset.emit = &emit_tex_offsets;
+
+       ALLOC_STATE(tex.chroma_key, variable, mtu + 1, 0);
+       r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_CHROMA_KEY_0, 0);
+
+       ALLOC_STATE(tex.border_color, variable, mtu + 1, 0);
+       r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_BORDER_COLOR_0, 0);
+
+       r300->radeon.hw.is_dirty = GL_TRUE;
+       r300->radeon.hw.all_dirty = GL_TRUE;
+
+       rcommonInitCmdBuf(&r300->radeon);
+}
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.h b/src/mesa/drivers/dri/r600/r600_cmdbuf.h
new file mode 100644 (file)
index 0000000..eeaca96
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R600_CMDBUF_H__
+#define __R600_CMDBUF_H__
+
+#include "r600_context.h"
+
+extern void r300InitCmdBuf(r300ContextPtr r300);
+
+void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom);
+int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom);
+
+void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom);
+int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom);
+int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom);
+
+#endif                         /* __R600_CMDBUF_H__ */
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
new file mode 100644 (file)
index 0000000..0b351b1
--- /dev/null
@@ -0,0 +1,472 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "main/glheader.h"
+#include "main/api_arrayelt.h"
+#include "main/context.h"
+#include "main/simple_list.h"
+#include "main/imports.h"
+#include "main/matrix.h"
+#include "main/extensions.h"
+#include "main/state.h"
+#include "main/bufferobj.h"
+#include "main/texobj.h"
+
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "tnl/t_vp_build.h"
+
+#include "drivers/common/driverfuncs.h"
+
+#include "r600_context.h"
+#include "radeon_context.h"
+#include "radeon_span.h"
+#include "r600_cmdbuf.h"
+#include "r600_state.h"
+#include "r600_ioctl.h"
+#include "r600_tex.h"
+#include "r600_emit.h"
+#include "r600_swtcl.h"
+#include "radeon_bocs_wrapper.h"
+
+
+#include "vblank.h"
+#include "utils.h"
+#include "xmlpool.h"           /* for symbolic values of enum-type options */
+
+/* hw_tcl_on derives from future_hw_tcl_on when its safe to change it. */
+int future_hw_tcl_on = 1;
+int hw_tcl_on = 1;
+
+#define need_GL_VERSION_2_0
+#define need_GL_ARB_point_parameters
+#define need_GL_ARB_vertex_program
+#define need_GL_EXT_blend_equation_separate
+#define need_GL_EXT_blend_func_separate
+#define need_GL_EXT_blend_minmax
+#define need_GL_EXT_framebuffer_object
+#define need_GL_EXT_fog_coord
+#define need_GL_EXT_gpu_program_parameters
+#define need_GL_EXT_secondary_color
+#define need_GL_EXT_stencil_two_side
+#define need_GL_ATI_separate_stencil
+#define need_GL_NV_vertex_program
+
+#include "extension_helper.h"
+
+
+const struct dri_extension card_extensions[] = {
+  /* *INDENT-OFF* */
+  {"GL_ARB_depth_texture",             NULL},
+  {"GL_ARB_fragment_program",          NULL},
+  {"GL_ARB_multitexture",              NULL},
+  {"GL_ARB_point_parameters",          GL_ARB_point_parameters_functions},
+  {"GL_ARB_shadow",                    NULL},
+  {"GL_ARB_shadow_ambient",            NULL},
+  {"GL_ARB_texture_border_clamp",      NULL},
+  {"GL_ARB_texture_cube_map",          NULL},
+  {"GL_ARB_texture_env_add",           NULL},
+  {"GL_ARB_texture_env_combine",       NULL},
+  {"GL_ARB_texture_env_crossbar",      NULL},
+  {"GL_ARB_texture_env_dot3",          NULL},
+  {"GL_ARB_texture_mirrored_repeat",   NULL},
+  {"GL_ARB_vertex_program",            GL_ARB_vertex_program_functions},
+  {"GL_EXT_blend_equation_separate",   GL_EXT_blend_equation_separate_functions},
+  {"GL_EXT_blend_func_separate",       GL_EXT_blend_func_separate_functions},
+  {"GL_EXT_blend_minmax",              GL_EXT_blend_minmax_functions},
+  {"GL_EXT_blend_subtract",            NULL},
+  {"GL_EXT_packed_depth_stencil",      NULL},
+  {"GL_EXT_fog_coord",                 GL_EXT_fog_coord_functions },
+  {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
+  {"GL_EXT_secondary_color",           GL_EXT_secondary_color_functions},
+  {"GL_EXT_shadow_funcs",              NULL},
+  {"GL_EXT_stencil_two_side",          GL_EXT_stencil_two_side_functions},
+  {"GL_EXT_stencil_wrap",              NULL},
+  {"GL_EXT_texture_edge_clamp",                NULL},
+  {"GL_EXT_texture_env_combine",       NULL},
+  {"GL_EXT_texture_env_dot3",          NULL},
+  {"GL_EXT_texture_filter_anisotropic",        NULL},
+  {"GL_EXT_texture_lod_bias",          NULL},
+  {"GL_EXT_texture_mirror_clamp",      NULL},
+  {"GL_EXT_texture_rectangle",         NULL},
+  {"GL_ATI_separate_stencil",          GL_ATI_separate_stencil_functions},
+  {"GL_ATI_texture_env_combine3",      NULL},
+  {"GL_ATI_texture_mirror_once",       NULL},
+  {"GL_MESA_pack_invert",              NULL},
+  {"GL_MESA_ycbcr_texture",            NULL},
+  {"GL_MESAX_texture_float",           NULL},
+  {"GL_NV_blend_square",               NULL},
+  {"GL_NV_vertex_program",             GL_NV_vertex_program_functions},
+  {"GL_SGIS_generate_mipmap",          NULL},
+  {NULL,                               NULL}
+  /* *INDENT-ON* */
+};
+
+
+const struct dri_extension mm_extensions[] = {
+  { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
+  { NULL, NULL }
+};
+
+/**
+ * The GL 2.0 functions are needed to make display lists work with
+ * functions added by GL_ATI_separate_stencil.
+ */
+const struct dri_extension gl_20_extension[] = {
+  {"GL_VERSION_2_0",                   GL_VERSION_2_0_functions },
+};
+
+
+extern struct tnl_pipeline_stage _r300_render_stage;
+extern const struct tnl_pipeline_stage _r300_tcl_stage;
+
+static const struct tnl_pipeline_stage *r300_pipeline[] = {
+
+       /* Try and go straight to t&l
+        */
+       &_r300_tcl_stage,
+
+       /* Catch any t&l fallbacks
+        */
+       &_tnl_vertex_transform_stage,
+       &_tnl_normal_transform_stage,
+       &_tnl_lighting_stage,
+       &_tnl_fog_coordinate_stage,
+       &_tnl_texgen_stage,
+       &_tnl_texture_transform_stage,
+       &_tnl_vertex_program_stage,
+
+       /* Try again to go to tcl?
+        *     - no good for asymmetric-twoside (do with multipass)
+        *     - no good for asymmetric-unfilled (do with multipass)
+        *     - good for material
+        *     - good for texgen
+        *     - need to manipulate a bit of state
+        *
+        * - worth it/not worth it?
+        */
+
+       /* Else do them here.
+        */
+       &_r300_render_stage,
+       &_tnl_render_stage,     /* FALLBACK  */
+       0,
+};
+
+static void r300RunPipeline(GLcontext * ctx)
+{
+    _mesa_lock_context_textures(ctx);
+
+    if (ctx->NewState)
+        _mesa_update_state_locked(ctx);
+    
+    _tnl_run_pipeline(ctx);
+    _mesa_unlock_context_textures(ctx);
+}
+
+static void r300_get_lock(radeonContextPtr rmesa)
+{
+       drm_radeon_sarea_t *sarea = rmesa->sarea;
+
+       if (sarea->ctx_owner != rmesa->dri.hwContext) {
+               sarea->ctx_owner = rmesa->dri.hwContext;
+               if (!rmesa->radeonScreen->kernel_mm)
+                       radeon_bo_legacy_texture_age(rmesa->radeonScreen->bom);
+       }
+}                
+
+static void r300_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
+{
+    /* please flush pipe do all pending work */
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_SC_SCREENDOOR, 1));
+    radeon_cs_write_dword(cs, 0x0);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_SC_SCREENDOOR, 1));
+    radeon_cs_write_dword(cs, 0x00FFFFFF);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_SC_HYPERZ, 1));
+    radeon_cs_write_dword(cs, 0x0);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_US_CONFIG, 1));
+    radeon_cs_write_dword(cs, 0x0);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_ZB_CNTL, 1));
+    radeon_cs_write_dword(cs, 0x0);
+    radeon_cs_write_dword(cs, cmdwait(rmesa->radeonScreen, R300_WAIT_3D));
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_RB3D_DSTCACHE_CTLSTAT, 1));
+    radeon_cs_write_dword(cs, R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_ZB_ZCACHE_CTLSTAT, 1));
+    radeon_cs_write_dword(cs, R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE);
+    radeon_cs_write_dword(cs, cmdwait(rmesa->radeonScreen,
+                               R300_WAIT_3D | R300_WAIT_3D_CLEAN));
+}
+
+static void r300_vtbl_pre_emit_atoms(radeonContextPtr radeon)
+{
+       r300ContextPtr r300 = (r300ContextPtr)radeon;
+       BATCH_LOCALS(radeon);
+       
+       r300->vap_flush_needed = GL_TRUE;
+       
+       cp_wait(radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+       BEGIN_BATCH_NO_AUTOSTATE(2);
+       OUT_BATCH_REGVAL(R300_TX_INVALTAGS, R300_TX_FLUSH);
+       END_BATCH();
+       end_3d(radeon);
+}
+
+static void r300_fallback(GLcontext *ctx, GLuint bit, GLboolean mode)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       if (mode)
+               r300->radeon.Fallback |= bit;
+       else
+               r300->radeon.Fallback &= ~bit;
+}
+
+static void r300_init_vtbl(radeonContextPtr radeon)
+{
+       radeon->vtbl.get_lock = r300_get_lock;
+       radeon->vtbl.update_viewport_offset = r300UpdateViewportOffset;
+       radeon->vtbl.emit_cs_header = r300_vtbl_emit_cs_header;
+       radeon->vtbl.swtcl_flush = r300_swtcl_flush;
+       radeon->vtbl.pre_emit_atoms = r300_vtbl_pre_emit_atoms;
+       radeon->vtbl.fallback = r300_fallback;
+}
+
+
+/* Create the device specific rendering context.
+ */
+GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+                           __DRIcontextPrivate * driContextPriv,
+                           void *sharedContextPrivate)
+{
+       __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+       radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+       struct dd_function_table functions;
+       r300ContextPtr r300;
+       GLcontext *ctx;
+       int tcl_mode;
+
+       assert(glVisual);
+       assert(driContextPriv);
+       assert(screen);
+
+       /* Allocate the R300 context */
+       r300 = (r300ContextPtr) CALLOC(sizeof(*r300));
+       if (!r300)
+               return GL_FALSE;
+
+       if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
+               hw_tcl_on = future_hw_tcl_on = 0;
+
+       r300_init_vtbl(&r300->radeon);
+       /* Parse configuration files.
+        * Do this here so that initialMaxAnisotropy is set before we create
+        * the default textures.
+        */
+       driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
+                           screen->driScreen->myNum, "r300");
+       r300->radeon.initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache,
+                                                    "def_max_anisotropy");
+
+       /* Init default driver functions then plug in our R300-specific functions
+        * (the texture functions are especially important)
+        */
+       _mesa_init_driver_functions(&functions);
+       r300InitIoctlFuncs(&functions);
+       r300InitStateFuncs(&functions);
+       r300InitTextureFuncs(&functions);
+       r300InitShaderFuncs(&functions);
+
+       if (!radeonInitContext(&r300->radeon, &functions,
+                              glVisual, driContextPriv,
+                              sharedContextPrivate)) {
+               FREE(r300);
+               return GL_FALSE;
+       }
+
+       /* Init r300 context data */
+       /* Set the maximum texture size small enough that we can guarentee that
+        * all texture units can bind a maximal texture and have them both in
+        * texturable memory at once.
+        */
+
+       ctx = r300->radeon.glCtx;
+
+       ctx->Const.MaxTextureImageUnits =
+           driQueryOptioni(&r300->radeon.optionCache, "texture_image_units");
+       ctx->Const.MaxTextureCoordUnits =
+           driQueryOptioni(&r300->radeon.optionCache, "texture_coord_units");
+       ctx->Const.MaxTextureUnits =
+           MIN2(ctx->Const.MaxTextureImageUnits,
+                ctx->Const.MaxTextureCoordUnits);
+       ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+       ctx->Const.MaxTextureLodBias = 16.0;
+
+       if (screen->chip_family >= CHIP_FAMILY_RV515) {
+           ctx->Const.MaxTextureLevels = 13;
+           ctx->Const.MaxTextureRectSize = 4096;
+       }
+
+       ctx->Const.MinPointSize = 1.0;
+       ctx->Const.MinPointSizeAA = 1.0;
+       ctx->Const.MaxPointSize = R300_POINTSIZE_MAX;
+       ctx->Const.MaxPointSizeAA = R300_POINTSIZE_MAX;
+
+       ctx->Const.MinLineWidth = 1.0;
+       ctx->Const.MinLineWidthAA = 1.0;
+       ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
+       ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
+
+       /* Needs further modifications */
+#if 0
+       ctx->Const.MaxArrayLockSize =
+           ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
+#endif
+
+       ctx->Const.MaxDrawBuffers = 1;
+
+       /* Initialize the software rasterizer and helper modules.
+        */
+       _swrast_CreateContext(ctx);
+       _vbo_CreateContext(ctx);
+       _tnl_CreateContext(ctx);
+       _swsetup_CreateContext(ctx);
+       _swsetup_Wakeup(ctx);
+       _ae_create_context(ctx);
+
+       /* Install the customized pipeline:
+        */
+       _tnl_destroy_pipeline(ctx);
+       _tnl_install_pipeline(ctx, r300_pipeline);
+
+       /* Try and keep materials and vertices separate:
+        */
+/*     _tnl_isolate_materials(ctx, GL_TRUE); */
+
+       /* Configure swrast and TNL to match hardware characteristics:
+        */
+       _swrast_allow_pixel_fog(ctx, GL_FALSE);
+       _swrast_allow_vertex_fog(ctx, GL_TRUE);
+       _tnl_allow_pixel_fog(ctx, GL_FALSE);
+       _tnl_allow_vertex_fog(ctx, GL_TRUE);
+
+       /* currently bogus data */
+       if (screen->chip_flags & RADEON_CHIPSET_TCL) {
+               ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
+               ctx->Const.VertexProgram.MaxNativeInstructions =
+                 VSF_MAX_FRAGMENT_LENGTH / 4;
+               ctx->Const.VertexProgram.MaxNativeAttribs = 16; /* r420 */
+               ctx->Const.VertexProgram.MaxTemps = 32;
+               ctx->Const.VertexProgram.MaxNativeTemps =
+                 /*VSF_MAX_FRAGMENT_TEMPS */ 32;
+               ctx->Const.VertexProgram.MaxNativeParameters = 256;     /* r420 */
+               ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+       }
+
+       ctx->Const.FragmentProgram.MaxNativeTemps = PFS_NUM_TEMP_REGS;
+       ctx->Const.FragmentProgram.MaxNativeAttribs = 11;       /* copy i915... */
+       ctx->Const.FragmentProgram.MaxNativeParameters = PFS_NUM_CONST_REGS;
+       ctx->Const.FragmentProgram.MaxNativeAluInstructions = PFS_MAX_ALU_INST;
+       ctx->Const.FragmentProgram.MaxNativeTexInstructions = PFS_MAX_TEX_INST;
+       ctx->Const.FragmentProgram.MaxNativeInstructions =
+           PFS_MAX_ALU_INST + PFS_MAX_TEX_INST;
+       ctx->Const.FragmentProgram.MaxNativeTexIndirections =
+           PFS_MAX_TEX_INDIRECT;
+       ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;    /* and these are?? */
+       ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
+       ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
+
+       driInitExtensions(ctx, card_extensions, GL_TRUE);
+       if (r300->radeon.radeonScreen->kernel_mm)
+         driInitExtensions(ctx, mm_extensions, GL_FALSE);
+
+       if (driQueryOptionb
+           (&r300->radeon.optionCache, "disable_stencil_two_side"))
+               _mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
+
+       if (r300->radeon.glCtx->Mesa_DXTn
+           && !driQueryOptionb(&r300->radeon.optionCache, "disable_s3tc")) {
+               _mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+               _mesa_enable_extension(ctx, "GL_S3_s3tc");
+       } else
+           if (driQueryOptionb(&r300->radeon.optionCache, "force_s3tc_enable"))
+       {
+               _mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+       }
+
+       r300->disable_lowimpact_fallback =
+           driQueryOptionb(&r300->radeon.optionCache,
+                           "disable_lowimpact_fallback");
+       radeon_fbo_init(&r300->radeon);
+       radeonInitSpanFuncs( ctx );
+       r300InitCmdBuf(r300);
+       r300InitState(r300);
+       if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
+               r300InitSwtcl(ctx);
+
+       TNL_CONTEXT(ctx)->Driver.RunPipeline = r300RunPipeline;
+
+       tcl_mode = driQueryOptioni(&r300->radeon.optionCache, "tcl_mode");
+       if (driQueryOptionb(&r300->radeon.optionCache, "no_rast")) {
+               fprintf(stderr, "disabling 3D acceleration\n");
+#if R200_MERGED
+               FALLBACK(&r300->radeon, RADEON_FALLBACK_DISABLE, 1);
+#endif
+       }
+       if (tcl_mode == DRI_CONF_TCL_SW ||
+           !(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+               if (r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+                       r300->radeon.radeonScreen->chip_flags &=
+                           ~RADEON_CHIPSET_TCL;
+                       fprintf(stderr, "Disabling HW TCL support\n");
+               }
+               TCL_FALLBACK(r300->radeon.glCtx,
+                            RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
+       }
+
+       return GL_TRUE;
+}
+
diff --git a/src/mesa/drivers/dri/r600/r600_context.h b/src/mesa/drivers/dri/r600/r600_context.h
new file mode 100644 (file)
index 0000000..0d713f7
--- /dev/null
@@ -0,0 +1,716 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R600_CONTEXT_H__
+#define __R600_CONTEXT_H__
+
+#include "tnl/t_vertex.h"
+#include "drm.h"
+#include "radeon_drm.h"
+#include "dri_util.h"
+#include "texmem.h"
+#include "radeon_common.h"
+
+#include "main/macros.h"
+#include "main/mtypes.h"
+#include "main/colormac.h"
+
+struct r300_context;
+typedef struct r300_context r300ContextRec;
+typedef struct r300_context *r300ContextPtr;
+
+
+#include "main/mm.h"
+
+/* From http://gcc. gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
+   I suppose we could inline this and use macro to fetch out __LINE__ and stuff in case we run into trouble
+   with other compilers ... GLUE!
+*/
+#define WARN_ONCE(a, ...)      { \
+       static int warn##__LINE__=1; \
+       if(warn##__LINE__){ \
+               fprintf(stderr, "*********************************WARN_ONCE*********************************\n"); \
+               fprintf(stderr, "File %s function %s line %d\n", \
+                       __FILE__, __FUNCTION__, __LINE__); \
+               fprintf(stderr,  a, ## __VA_ARGS__);\
+               fprintf(stderr, "***************************************************************************\n"); \
+               warn##__LINE__=0;\
+               } \
+       }
+
+#include "r600_vertprog.h"
+#include "r700_fragprog.h"
+
+
+
+/************ DMA BUFFERS **************/
+
+/* The blit width for texture uploads
+ */
+#define R300_BLIT_WIDTH_BYTES 1024
+#define R300_MAX_TEXTURE_UNITS 8
+
+struct r300_texture_state {
+       int tc_count;           /* number of incoming texture coordinates from VAP */
+};
+
+
+#define R300_VPT_CMD_0         0
+#define R300_VPT_XSCALE                1
+#define R300_VPT_XOFFSET       2
+#define R300_VPT_YSCALE                3
+#define R300_VPT_YOFFSET       4
+#define R300_VPT_ZSCALE                5
+#define R300_VPT_ZOFFSET       6
+#define R300_VPT_CMDSIZE       7
+
+#define R300_VIR_CMD_0         0       /* vir is variable size (at least 1) */
+#define R300_VIR_CNTL_0                1
+#define R300_VIR_CNTL_1                2
+#define R300_VIR_CNTL_2                3
+#define R300_VIR_CNTL_3                4
+#define R300_VIR_CNTL_4                5
+#define R300_VIR_CNTL_5                6
+#define R300_VIR_CNTL_6                7
+#define R300_VIR_CNTL_7                8
+#define R300_VIR_CMDSIZE       9
+
+#define R300_VIC_CMD_0         0
+#define R300_VIC_CNTL_0                1
+#define R300_VIC_CNTL_1                2
+#define R300_VIC_CMDSIZE       3
+
+#define R300_VOF_CMD_0         0
+#define R300_VOF_CNTL_0                1
+#define R300_VOF_CNTL_1                2
+#define R300_VOF_CMDSIZE       3
+
+#define R300_PVS_CMD_0         0
+#define R300_PVS_CNTL_1                1
+#define R300_PVS_CNTL_2                2
+#define R300_PVS_CNTL_3                3
+#define R300_PVS_CMDSIZE       4
+
+#define R300_GB_MISC_CMD_0             0
+#define R300_GB_MISC_MSPOS_0           1
+#define R300_GB_MISC_MSPOS_1           2
+#define R300_GB_MISC_TILE_CONFIG       3
+#define R300_GB_MISC_SELECT            4
+#define R300_GB_MISC_AA_CONFIG         5
+#define R300_GB_MISC_CMDSIZE           6
+
+#define R300_TXE_CMD_0         0
+#define R300_TXE_ENABLE                1
+#define R300_TXE_CMDSIZE       2
+
+#define R300_PS_CMD_0          0
+#define R300_PS_POINTSIZE      1
+#define R300_PS_CMDSIZE                2
+
+#define R300_ZBS_CMD_0         0
+#define R300_ZBS_T_FACTOR      1
+#define R300_ZBS_T_CONSTANT    2
+#define R300_ZBS_W_FACTOR      3
+#define R300_ZBS_W_CONSTANT    4
+#define R300_ZBS_CMDSIZE       5
+
+#define R300_CUL_CMD_0         0
+#define R300_CUL_CULL          1
+#define R300_CUL_CMDSIZE       2
+
+#define R300_RC_CMD_0          0
+#define R300_RC_CNTL_0         1
+#define R300_RC_CNTL_1         2
+#define R300_RC_CMDSIZE                3
+
+#define R300_RI_CMD_0          0
+#define R300_RI_INTERP_0       1
+#define R300_RI_INTERP_1       2
+#define R300_RI_INTERP_2       3
+#define R300_RI_INTERP_3       4
+#define R300_RI_INTERP_4       5
+#define R300_RI_INTERP_5       6
+#define R300_RI_INTERP_6       7
+#define R300_RI_INTERP_7       8
+#define R300_RI_CMDSIZE                9
+
+#define R500_RI_CMDSIZE               17
+
+#define R300_RR_CMD_0          0       /* rr is variable size (at least 1) */
+#define R300_RR_INST_0         1
+#define R300_RR_INST_1         2
+#define R300_RR_INST_2         3
+#define R300_RR_INST_3         4
+#define R300_RR_INST_4         5
+#define R300_RR_INST_5         6
+#define R300_RR_INST_6         7
+#define R300_RR_INST_7         8
+#define R300_RR_CMDSIZE                9
+
+#define R300_FP_CMD_0          0
+#define R300_FP_CNTL0          1
+#define R300_FP_CNTL1          2
+#define R300_FP_CNTL2          3
+#define R300_FP_CMD_1          4
+#define R300_FP_NODE0          5
+#define R300_FP_NODE1          6
+#define R300_FP_NODE2          7
+#define R300_FP_NODE3          8
+#define R300_FP_CMDSIZE                9
+
+#define R500_FP_CMD_0           0
+#define R500_FP_CNTL            1
+#define R500_FP_PIXSIZE         2
+#define R500_FP_CMD_1           3
+#define R500_FP_CODE_ADDR       4
+#define R500_FP_CODE_RANGE      5
+#define R500_FP_CODE_OFFSET     6
+#define R500_FP_CMD_2           7
+#define R500_FP_FC_CNTL         8
+#define R500_FP_CMDSIZE         9
+
+#define R300_FPT_CMD_0         0
+#define R300_FPT_INSTR_0       1
+#define R300_FPT_CMDSIZE       65
+
+#define R300_FPI_CMD_0         0
+#define R300_FPI_INSTR_0       1
+#define R300_FPI_CMDSIZE       65
+/* R500 has space for 512 instructions - 6 dwords per instruction */
+#define R500_FPI_CMDSIZE       (512*6+1)
+
+#define R300_FPP_CMD_0         0
+#define R300_FPP_PARAM_0       1
+#define R300_FPP_CMDSIZE       (32*4+1)
+/* R500 has spcae for 256 constants - 4 dwords per constant */
+#define R500_FPP_CMDSIZE       (256*4+1)
+
+#define R300_FOGS_CMD_0                0
+#define R300_FOGS_STATE                1
+#define R300_FOGS_CMDSIZE      2
+
+#define R300_FOGC_CMD_0                0
+#define R300_FOGC_R            1
+#define R300_FOGC_G            2
+#define R300_FOGC_B            3
+#define R300_FOGC_CMDSIZE      4
+
+#define R300_FOGP_CMD_0                0
+#define R300_FOGP_SCALE                1
+#define R300_FOGP_START                2
+#define R300_FOGP_CMDSIZE      3
+
+#define R300_AT_CMD_0          0
+#define R300_AT_ALPHA_TEST     1
+#define R300_AT_UNKNOWN                2
+#define R300_AT_CMDSIZE                3
+
+#define R300_BLD_CMD_0         0
+#define R300_BLD_CBLEND                1
+#define R300_BLD_ABLEND                2
+#define R300_BLD_CMDSIZE       3
+
+#define R300_CMK_CMD_0         0
+#define R300_CMK_COLORMASK     1
+#define R300_CMK_CMDSIZE       2
+
+#define R300_CB_CMD_0          0
+#define R300_CB_OFFSET         1
+#define R300_CB_CMD_1          2
+#define R300_CB_PITCH          3
+#define R300_CB_CMDSIZE                4
+
+#define R300_ZS_CMD_0          0
+#define R300_ZS_CNTL_0         1
+#define R300_ZS_CNTL_1         2
+#define R300_ZS_CNTL_2         3
+#define R300_ZS_CMDSIZE                4
+
+#define R300_ZB_CMD_0          0
+#define R300_ZB_OFFSET         1
+#define R300_ZB_PITCH          2
+#define R300_ZB_CMDSIZE                3
+
+#define R300_VAP_CNTL_FLUSH     0
+#define R300_VAP_CNTL_FLUSH_1   1
+#define R300_VAP_CNTL_CMD       2
+#define R300_VAP_CNTL_INSTR     3
+#define R300_VAP_CNTL_SIZE      4
+
+#define R300_VPI_CMD_0         0
+#define R300_VPI_INSTR_0       1
+#define R300_VPI_CMDSIZE       1025    /* 256 16 byte instructions */
+
+#define R300_VPP_CMD_0         0
+#define R300_VPP_PARAM_0       1
+#define R300_VPP_CMDSIZE       1025    /* 256 4-component parameters */
+
+#define R300_VPUCP_CMD_0               0
+#define R300_VPUCP_X            1
+#define R300_VPUCP_Y            2
+#define R300_VPUCP_Z            3
+#define R300_VPUCP_W            4
+#define R300_VPUCP_CMDSIZE     5       /* 256 4-component parameters */
+
+#define R300_VPS_CMD_0         0
+#define R300_VPS_ZERO_0                1
+#define R300_VPS_ZERO_1                2
+#define R300_VPS_POINTSIZE     3
+#define R300_VPS_ZERO_3                4
+#define R300_VPS_CMDSIZE       5
+
+       /* the layout is common for all fields inside tex */
+#define R300_TEX_CMD_0         0
+#define R300_TEX_VALUE_0       1
+/* We don't really use this, instead specify mtu+1 dynamically
+#define R300_TEX_CMDSIZE       (MAX_TEXTURE_UNITS+1)
+*/
+
+/**
+ * Cache for hardware register state.
+ */
+struct r300_hw_state {
+       struct radeon_state_atom vpt;   /* viewport (1D98) */
+       struct radeon_state_atom vap_cntl;
+        struct radeon_state_atom vap_index_offset; /* 0x208c r5xx only */
+       struct radeon_state_atom vof;   /* VAP output format register 0x2090 */
+       struct radeon_state_atom vte;   /* (20B0) */
+       struct radeon_state_atom vap_vf_max_vtx_indx;   /* Maximum Vertex Indx Clamp (2134) */
+       struct radeon_state_atom vap_cntl_status;
+       struct radeon_state_atom vir[2];        /* vap input route (2150/21E0) */
+       struct radeon_state_atom vic;   /* vap input control (2180) */
+       struct radeon_state_atom vap_psc_sgn_norm_cntl; /* Programmable Stream Control Signed Normalize Control (21DC) */
+       struct radeon_state_atom vap_clip_cntl;
+       struct radeon_state_atom vap_clip;
+       struct radeon_state_atom vap_pvs_vtx_timeout_reg;       /* Vertex timeout register (2288) */
+       struct radeon_state_atom pvs;   /* pvs_cntl (22D0) */
+       struct radeon_state_atom gb_enable;     /* (4008) */
+       struct radeon_state_atom gb_misc;       /* Multisampling position shifts ? (4010) */
+       struct radeon_state_atom ga_point_s0;   /* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) (4200) */
+       struct radeon_state_atom ga_triangle_stipple;   /* (4214) */
+       struct radeon_state_atom ps;    /* pointsize (421C) */
+       struct radeon_state_atom ga_point_minmax;       /* (4230) */
+       struct radeon_state_atom lcntl; /* line control */
+       struct radeon_state_atom ga_line_stipple;       /* (4260) */
+       struct radeon_state_atom shade;
+       struct radeon_state_atom polygon_mode;
+       struct radeon_state_atom fogp;  /* fog parameters (4294) */
+       struct radeon_state_atom ga_soft_reset; /* (429C) */
+       struct radeon_state_atom zbias_cntl;
+       struct radeon_state_atom zbs;   /* zbias (42A4) */
+       struct radeon_state_atom occlusion_cntl;
+       struct radeon_state_atom cul;   /* cull cntl (42B8) */
+       struct radeon_state_atom su_depth_scale;        /* (42C0) */
+       struct radeon_state_atom rc;    /* rs control (4300) */
+       struct radeon_state_atom ri;    /* rs interpolators (4310) */
+       struct radeon_state_atom rr;    /* rs route (4330) */
+       struct radeon_state_atom sc_hyperz;     /* (43A4) */
+       struct radeon_state_atom sc_screendoor; /* (43E8) */
+       struct radeon_state_atom fp;    /* fragment program cntl + nodes (4600) */
+       struct radeon_state_atom fpt;   /* texi - (4620) */
+       struct radeon_state_atom us_out_fmt;    /* (46A4) */
+       struct radeon_state_atom r500fp;        /* r500 fp instructions */
+       struct radeon_state_atom r500fp_const;  /* r500 fp constants */
+       struct radeon_state_atom fpi[4];        /* fp instructions (46C0/47C0/48C0/49C0) */
+       struct radeon_state_atom fogs;  /* fog state (4BC0) */
+       struct radeon_state_atom fogc;  /* fog color (4BC8) */
+       struct radeon_state_atom at;    /* alpha test (4BD4) */
+       struct radeon_state_atom fg_depth_src;  /* (4BD8) */
+       struct radeon_state_atom fpp;   /* 0x4C00 and following */
+       struct radeon_state_atom rb3d_cctl;     /* (4E00) */
+       struct radeon_state_atom bld;   /* blending (4E04) */
+       struct radeon_state_atom cmk;   /* colormask (4E0C) */
+       struct radeon_state_atom blend_color;   /* constant blend color */
+       struct radeon_state_atom rop;   /* ropcntl */
+       struct radeon_state_atom cb;    /* colorbuffer (4E28) */
+       struct radeon_state_atom rb3d_dither_ctl;       /* (4E50) */
+       struct radeon_state_atom rb3d_aaresolve_ctl;    /* (4E88) */
+       struct radeon_state_atom rb3d_discard_src_pixel_lte_threshold;  /* (4E88) I saw it only written on RV350 hardware..  */
+       struct radeon_state_atom zs;    /* zstencil control (4F00) */
+       struct radeon_state_atom zstencil_format;
+       struct radeon_state_atom zb;    /* z buffer (4F20) */
+       struct radeon_state_atom zb_depthclearvalue;    /* (4F28) */
+       struct radeon_state_atom unk4F30;       /* (4F30) */
+       struct radeon_state_atom zb_hiz_offset; /* (4F44) */
+       struct radeon_state_atom zb_hiz_pitch;  /* (4F54) */
+
+       struct radeon_state_atom vpi;   /* vp instructions */
+       struct radeon_state_atom vpp;   /* vp parameters */
+       struct radeon_state_atom vps;   /* vertex point size (?) */
+       struct radeon_state_atom vpucp[6];      /* vp user clip plane - 6 */
+       /* 8 texture units */
+       /* the state is grouped by function and not by
+          texture unit. This makes single unit updates
+          really awkward - we are much better off
+          updating the whole thing at once */
+       struct {
+               struct radeon_state_atom filter;
+               struct radeon_state_atom filter_1;
+               struct radeon_state_atom size;
+               struct radeon_state_atom format;
+               struct radeon_state_atom pitch;
+               struct radeon_state_atom offset;
+               struct radeon_state_atom chroma_key;
+               struct radeon_state_atom border_color;
+       } tex;
+       struct radeon_state_atom txe;   /* tex enable (4104) */
+
+       radeonTexObj *textures[R300_MAX_TEXTURE_UNITS];
+};
+
+/**
+ * State cache
+ */
+
+/* Vertex shader state */
+
+/* Perhaps more if we store programs in vmem? */
+/* drm_r300_cmd_header_t->vpu->count is unsigned char */
+#define VSF_MAX_FRAGMENT_LENGTH (255*4)
+
+/* Can be tested with colormat currently. */
+#define VSF_MAX_FRAGMENT_TEMPS (14)
+
+#define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
+#define STATE_R300_TEXRECT_FACTOR (STATE_INTERNAL_DRIVER+1)
+
+struct r300_vertex_shader_fragment {
+       int length;
+       union {
+               GLuint d[VSF_MAX_FRAGMENT_LENGTH];
+               float f[VSF_MAX_FRAGMENT_LENGTH];
+               GLuint i[VSF_MAX_FRAGMENT_LENGTH];
+       } body;
+};
+
+struct r300_vertex_shader_state {
+       struct r300_vertex_shader_fragment program;
+};
+
+extern int hw_tcl_on;
+
+#define COLOR_IS_RGBA
+#define TAG(x) r300##x
+#include "tnl_dd/t_dd_vertex.h"
+#undef TAG
+
+//#define CURRENT_VERTEX_SHADER(ctx) (ctx->VertexProgram._Current)
+#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->selected_vp)
+
+/* Should but doesnt work */
+//#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->curr_vp)
+
+/* r300_vertex_shader_state and r300_vertex_program should probably be merged together someday.
+ * Keeping them them seperate for now should ensure fixed pipeline keeps functioning properly.
+ */
+
+struct r300_vertex_program_key {
+       GLuint InputsRead;
+       GLuint OutputsWritten;
+       GLuint OutputsAdded;
+};
+
+struct r300_vertex_program {
+       struct r300_vertex_program *next;
+       struct r300_vertex_program_key key;
+       int translated;
+
+       struct r300_vertex_shader_fragment program;
+
+       int pos_end;
+       int num_temporaries;    /* Number of temp vars used by program */
+       int wpos_idx;
+       int inputs[VERT_ATTRIB_MAX];
+       int outputs[VERT_RESULT_MAX];
+       int native;
+       int ref_count;
+       int use_ref_count;
+};
+
+struct r300_vertex_program_cont {
+       struct gl_vertex_program mesa_program;  /* Must be first */
+       struct r300_vertex_shader_fragment params;
+       struct r300_vertex_program *progs;
+};
+
+#define PFS_MAX_ALU_INST       64
+#define PFS_MAX_TEX_INST       64
+#define PFS_MAX_TEX_INDIRECT 4
+#define PFS_NUM_TEMP_REGS      32
+#define PFS_NUM_CONST_REGS     16
+
+struct r300_pfs_compile_state;
+
+
+/**
+ * Stores state that influences the compilation of a fragment program.
+ */
+struct r300_fragment_program_external_state {
+       struct {
+               /**
+                * If the sampler is used as a shadow sampler,
+                * this field is:
+                *  0 - GL_LUMINANCE
+                *  1 - GL_INTENSITY
+                *  2 - GL_ALPHA
+                * depending on the depth texture mode.
+                */
+               GLuint depth_texture_mode : 2;
+
+               /**
+                * If the sampler is used as a shadow sampler,
+                * this field is (texture_compare_func - GL_NEVER).
+                * [e.g. if compare function is GL_LEQUAL, this field is 3]
+                *
+                * Otherwise, this field is 0.
+                */
+               GLuint texture_compare_func : 3;
+       } unit[16];
+};
+
+
+struct r300_fragment_program_node {
+       int tex_offset; /**< first tex instruction */
+       int tex_end; /**< last tex instruction, relative to tex_offset */
+       int alu_offset; /**< first ALU instruction */
+       int alu_end; /**< last ALU instruction, relative to alu_offset */
+       int flags;
+};
+
+/**
+ * Stores an R300 fragment program in its compiled-to-hardware form.
+ */
+struct r300_fragment_program_code {
+       struct {
+               int length; /**< total # of texture instructions used */
+               GLuint inst[PFS_MAX_TEX_INST];
+       } tex;
+
+       struct {
+               int length; /**< total # of ALU instructions used */
+               struct {
+                       GLuint inst0;
+                       GLuint inst1;
+                       GLuint inst2;
+                       GLuint inst3;
+               } inst[PFS_MAX_ALU_INST];
+       } alu;
+
+       struct r300_fragment_program_node node[4];
+       int cur_node;
+       int first_node_has_tex;
+
+       /**
+        * Remember which program register a given hardware constant
+        * belongs to.
+        */
+       struct prog_src_register constant[PFS_NUM_CONST_REGS];
+       int const_nr;
+
+       int max_temp_idx;
+};
+
+/**
+ * Store everything about a fragment program that is needed
+ * to render with that program.
+ */
+struct r300_fragment_program {
+       struct gl_fragment_program mesa_program;
+
+       GLboolean translated;
+       GLboolean error;
+
+       struct r300_fragment_program_external_state state;
+       struct r300_fragment_program_code code;
+
+       GLboolean WritesDepth;
+       GLuint optimization;
+};
+
+struct r500_pfs_compile_state;
+
+struct r500_fragment_program_external_state {
+       struct {
+               /**
+                * If the sampler is used as a shadow sampler,
+                * this field is:
+                *  0 - GL_LUMINANCE
+                *  1 - GL_INTENSITY
+                *  2 - GL_ALPHA
+                * depending on the depth texture mode.
+                */
+               GLuint depth_texture_mode : 2;
+
+               /**
+                * If the sampler is used as a shadow sampler,
+                * this field is (texture_compare_func - GL_NEVER).
+                * [e.g. if compare function is GL_LEQUAL, this field is 3]
+                *
+                * Otherwise, this field is 0.
+                */
+               GLuint texture_compare_func : 3;
+       } unit[16];
+};
+
+struct r500_fragment_program_code {
+       struct {
+               GLuint inst0;
+               GLuint inst1;
+               GLuint inst2;
+               GLuint inst3;
+               GLuint inst4;
+               GLuint inst5;
+       } inst[512];
+
+       int inst_offset;
+       int inst_end;
+
+       /**
+        * Remember which program register a given hardware constant
+        * belongs to.
+        */
+       struct prog_src_register constant[PFS_NUM_CONST_REGS];
+       int const_nr;
+
+       int max_temp_idx;
+};
+
+struct r500_fragment_program {
+       struct gl_fragment_program mesa_program;
+
+       GLcontext *ctx;
+       GLboolean translated;
+       GLboolean error;
+
+       struct r500_fragment_program_external_state state;
+       struct r500_fragment_program_code code;
+
+       GLboolean writes_depth;
+
+       GLuint optimization;
+};
+
+#define R300_MAX_AOS_ARRAYS            16
+
+#define REG_COORDS     0
+#define REG_COLOR0     1
+#define REG_TEX0       2
+
+struct r300_state {
+       struct r300_texture_state texture;
+       int sw_tcl_inputs[VERT_ATTRIB_MAX];
+       struct r300_vertex_shader_state vertex_shader;
+
+
+       DECLARE_RENDERINPUTS(render_inputs_bitset);     /* actual render inputs that R300 was configured for.
+                                                          They are the same as tnl->render_inputs for fixed pipeline */
+
+};
+
+#define R300_FALLBACK_NONE 0
+#define R300_FALLBACK_TCL 1
+#define R300_FALLBACK_RAST 2
+
+/* r300_swtcl.c
+ */
+struct r300_swtcl_info {
+  /*
+    * Offset of the 4UB color data within a hardware (swtcl) vertex.
+    */
+   GLuint coloroffset;
+
+   /**
+    * Offset of the 3UB specular color data within a hardware (swtcl) vertex.
+    */
+   GLuint specoffset;
+
+   struct vertex_attribute{
+       GLuint attr;
+       GLubyte format;
+       GLubyte dst_loc;
+       GLuint swizzle;
+       GLubyte write_mask;
+   } vert_attrs[VERT_ATTRIB_MAX];
+
+   GLubyte vertex_attr_count;
+};
+
+
+/**
+ * \brief R300 context structure.
+ */
+struct r300_context {
+       struct radeon_context radeon;   /* parent class, must be first */
+
+       struct r300_hw_state hw;
+
+       struct r300_state state;
+       struct gl_vertex_program *curr_vp;
+       struct r300_vertex_program *selected_vp;
+
+       /* Vertex buffers
+        */
+       GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
+       GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
+
+       GLboolean disable_lowimpact_fallback;
+
+       struct r300_swtcl_info swtcl;
+       GLboolean vap_flush_needed;
+};
+
+#define R300_CONTEXT(ctx)              ((r300ContextPtr)(ctx->DriverCtx))
+
+extern void r300DestroyContext(__DRIcontextPrivate * driContextPriv);
+extern GLboolean r300CreateContext(const __GLcontextModes * glVisual,
+                                  __DRIcontextPrivate * driContextPriv,
+                                  void *sharedContextPrivate);
+
+extern void r300SelectVertexShader(r300ContextPtr r300);
+extern void r300InitShaderFuncs(struct dd_function_table *functions);
+extern int r300VertexProgUpdateParams(GLcontext * ctx,
+                                     struct r300_vertex_program_cont *vp,
+                                     float *dst);
+
+#define RADEON_D_CAPTURE 0
+#define RADEON_D_PLAYBACK 1
+#define RADEON_D_PLAYBACK_RAW 2
+#define RADEON_D_T 3
+
+#define r300PackFloat32 radeonPackFloat32
+#define r300PackFloat24 radeonPackFloat24
+
+#endif                         /* __R300_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r600/r600_emit.c b/src/mesa/drivers/dri/r600/r600_emit.c
new file mode 100644 (file)
index 0000000..4d16ad8
--- /dev/null
@@ -0,0 +1,364 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "main/colormac.h"
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/image.h"
+
+#include "swrast_setup/swrast_setup.h"
+#include "math/m_translate.h"
+#include "tnl/tnl.h"
+#include "tnl/t_context.h"
+
+#include "r600_context.h"
+#include "r600_state.h"
+#include "r600_emit.h"
+#include "r600_ioctl.h"
+
+
+#if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
+    SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
+    SWIZZLE_Z != R300_INPUT_ROUTE_SELECT_Z || \
+    SWIZZLE_W != R300_INPUT_ROUTE_SELECT_W || \
+    SWIZZLE_ZERO != R300_INPUT_ROUTE_SELECT_ZERO || \
+    SWIZZLE_ONE != R300_INPUT_ROUTE_SELECT_ONE
+#error Cannot change these!
+#endif
+
+#define DEBUG_ALL DEBUG_VERTS
+
+#define DW_SIZE(x) ((inputs[tab[(x)]] << R300_DST_VEC_LOC_SHIFT) |     \
+                   (attribptr[tab[(x)]]->size - 1) << R300_DATA_TYPE_0_SHIFT)
+
+GLuint r300VAPInputRoute0(uint32_t * dst, GLvector4f ** attribptr,
+                                int *inputs, GLint * tab, GLuint nr)
+{
+       GLuint i, dw;
+
+       /* type, inputs, stop bit, size */
+       for (i = 0; i < nr; i += 2) {
+               /* make sure input is valid, would lockup the gpu */
+               assert(inputs[tab[i]] != -1);
+               dw = (R300_SIGNED | DW_SIZE(i));
+               if (i + 1 == nr) {
+                       dw |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
+               } else {
+                       assert(inputs[tab[i + 1]] != -1);
+                       dw |= (R300_SIGNED |
+                              DW_SIZE(i + 1)) << R300_DATA_TYPE_1_SHIFT;
+                       if (i + 2 == nr) {
+                               dw |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
+                       }
+               }
+               dst[i >> 1] = dw;
+       }
+
+       return (nr + 1) >> 1;
+}
+
+static GLuint r300VAPInputRoute1Swizzle(int swizzle[4])
+{
+       return (swizzle[0] << R300_SWIZZLE_SELECT_X_SHIFT) |
+           (swizzle[1] << R300_SWIZZLE_SELECT_Y_SHIFT) |
+           (swizzle[2] << R300_SWIZZLE_SELECT_Z_SHIFT) |
+           (swizzle[3] << R300_SWIZZLE_SELECT_W_SHIFT);
+}
+
+GLuint r300VAPInputRoute1(uint32_t * dst, int swizzle[][4], GLuint nr)
+{
+       GLuint i, dw;
+
+       for (i = 0; i < nr; i += 2) {
+               dw = (r300VAPInputRoute1Swizzle(swizzle[i]) |
+                     ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y |
+                       R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE0_SHIFT;
+               if (i + 1 < nr) {
+                       dw |= (r300VAPInputRoute1Swizzle(swizzle[i + 1]) |
+                              ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y |
+                                R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
+               }
+               dst[i >> 1] = dw;
+       }
+
+       return (nr + 1) >> 1;
+}
+
+GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead)
+{
+       /* No idea what this value means. I have seen other values written to
+        * this register... */
+       return 0x5555;
+}
+
+GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       GLuint i, vic_1 = 0;
+
+       if (InputsRead & (1 << VERT_ATTRIB_POS))
+               vic_1 |= R300_INPUT_CNTL_POS;
+
+       if (InputsRead & (1 << VERT_ATTRIB_NORMAL))
+               vic_1 |= R300_INPUT_CNTL_NORMAL;
+
+       if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
+               vic_1 |= R300_INPUT_CNTL_COLOR;
+
+       rmesa->state.texture.tc_count = 0;
+       for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+               if (InputsRead & (1 << (VERT_ATTRIB_TEX0 + i))) {
+                       rmesa->state.texture.tc_count++;
+                       vic_1 |= R300_INPUT_CNTL_TC0 << i;
+               }
+
+       return vic_1;
+}
+
+GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten)
+{
+       GLuint ret = 0;
+
+       if (OutputsWritten & (1 << VERT_RESULT_HPOS))
+               ret |= R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
+
+       if (OutputsWritten & (1 << VERT_RESULT_COL0))
+               ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT;
+
+       if (OutputsWritten & (1 << VERT_RESULT_COL1))
+               ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT;
+
+       if (OutputsWritten & (1 << VERT_RESULT_BFC0)
+           || OutputsWritten & (1 << VERT_RESULT_BFC1))
+               ret |=
+                   R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT |
+                   R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT |
+                   R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT;
+
+       if (OutputsWritten & (1 << VERT_RESULT_PSIZ))
+               ret |= R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
+
+       return ret;
+}
+
+GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint OutputsWritten)
+{
+       GLuint i, ret = 0, first_free_texcoord = 0;
+
+       for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+               if (OutputsWritten & (1 << (VERT_RESULT_TEX0 + i))) {
+                       ret |= (4 << (3 * i));
+                       ++first_free_texcoord;
+               }
+       }
+
+       if (OutputsWritten & (1 << VERT_RESULT_FOGC)) {
+               if (first_free_texcoord > 8) {
+                       fprintf(stderr, "\tout of free texcoords to write fog coord\n");
+                       _mesa_exit(-1);
+               }
+               ret |= 4 << (3 * first_free_texcoord);
+       }
+
+       return ret;
+}
+
+/* Emit vertex data to GART memory
+ * Route inputs to the vertex processor
+ * This function should never return R300_FALLBACK_TCL when using software tcl.
+ */
+int r300EmitArrays(GLcontext * ctx)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       TNLcontext *tnl = TNL_CONTEXT(ctx);
+       struct vertex_buffer *vb = &tnl->vb;
+       GLuint nr;
+       GLuint count = vb->Count;
+       GLuint i;
+       GLuint InputsRead = 0, OutputsWritten = 0;
+       int *inputs = NULL;
+       int vir_inputs[VERT_ATTRIB_MAX];
+       GLint tab[VERT_ATTRIB_MAX];
+       int swizzle[VERT_ATTRIB_MAX][4];
+       struct r300_vertex_program *prog =
+           (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
+
+       if (hw_tcl_on) {
+               inputs = prog->inputs;
+               InputsRead = prog->key.InputsRead;
+               OutputsWritten = prog->key.OutputsWritten;
+       } else {
+               inputs = rmesa->state.sw_tcl_inputs;
+
+               DECLARE_RENDERINPUTS(render_inputs_bitset);
+               RENDERINPUTS_COPY(render_inputs_bitset, tnl->render_inputs_bitset);
+
+               vb->AttribPtr[VERT_ATTRIB_POS] = vb->ClipPtr;
+
+               assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS));
+               assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_NORMAL) == 0);
+
+               if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS)) {
+                       InputsRead |= 1 << VERT_ATTRIB_POS;
+                       OutputsWritten |= 1 << VERT_RESULT_HPOS;
+               }
+
+               if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR0)) {
+                       InputsRead |= 1 << VERT_ATTRIB_COLOR0;
+                       OutputsWritten |= 1 << VERT_RESULT_COL0;
+               }
+
+               if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR1)) {
+                       InputsRead |= 1 << VERT_ATTRIB_COLOR1;
+                       OutputsWritten |= 1 << VERT_RESULT_COL1;
+               }
+
+               for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+                       if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_TEX(i))) {
+                               InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
+                               OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
+                       }
+               }
+
+               for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
+                       if (InputsRead & (1 << i)) {
+                               inputs[i] = nr++;
+                       } else {
+                               inputs[i] = -1;
+                       }
+               }
+
+               /* Fixed, apply to vir0 only */
+               memcpy(vir_inputs, inputs, VERT_ATTRIB_MAX * sizeof(int));
+               inputs = vir_inputs;
+               if (InputsRead & VERT_ATTRIB_POS)
+                       inputs[VERT_ATTRIB_POS] = 0;
+               if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
+                       inputs[VERT_ATTRIB_COLOR0] = 2;
+               if (InputsRead & (1 << VERT_ATTRIB_COLOR1))
+                       inputs[VERT_ATTRIB_COLOR1] = 3;
+               for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
+                       if (InputsRead & (1 << i))
+                               inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
+
+               RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset, render_inputs_bitset);
+       }
+
+       assert(InputsRead);
+       assert(OutputsWritten);
+
+       for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
+               if (InputsRead & (1 << i)) {
+                       tab[nr++] = i;
+               }
+       }
+
+       if (nr > R300_MAX_AOS_ARRAYS) {
+               return R300_FALLBACK_TCL;
+       }
+
+       for (i = 0; i < nr; i++) {
+               int ci;
+
+               swizzle[i][0] = SWIZZLE_ZERO;
+               swizzle[i][1] = SWIZZLE_ZERO;
+               swizzle[i][2] = SWIZZLE_ZERO;
+               swizzle[i][3] = SWIZZLE_ONE;
+
+               for (ci = 0; ci < vb->AttribPtr[tab[i]]->size; ci++) {
+                       swizzle[i][ci] = ci;
+               }
+               rcommon_emit_vector(ctx, &rmesa->radeon.tcl.aos[i],
+                                   vb->AttribPtr[tab[i]]->data,
+                                   vb->AttribPtr[tab[i]]->size,
+                                   vb->AttribPtr[tab[i]]->stride, count);
+       }
+
+       /* Setup INPUT_ROUTE. */
+       if (rmesa->radeon.radeonScreen->kernel_mm) {
+               R300_STATECHANGE(rmesa, vir[0]);
+               rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
+               rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
+               rmesa->hw.vir[0].cmd[0] |=
+                       (r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
+                                           vb->AttribPtr, inputs, tab, nr) & 0x3FFF) << 16;
+               R300_STATECHANGE(rmesa, vir[1]);
+               rmesa->hw.vir[1].cmd[0] |=
+                       (r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
+                                           nr) & 0x3FFF) << 16;
+       } else {
+               R300_STATECHANGE(rmesa, vir[0]);
+               ((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
+                       r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
+                                          vb->AttribPtr, inputs, tab, nr);
+               R300_STATECHANGE(rmesa, vir[1]);
+               ((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
+                       r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
+                                          nr);
+       }
+       
+       /* Setup INPUT_CNTL. */
+       R300_STATECHANGE(rmesa, vic);
+       rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
+       rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
+
+       /* Setup OUTPUT_VTX_FMT. */
+       R300_STATECHANGE(rmesa, vof);
+       rmesa->hw.vof.cmd[R300_VOF_CNTL_0] =
+           r300VAPOutputCntl0(ctx, OutputsWritten);
+       rmesa->hw.vof.cmd[R300_VOF_CNTL_1] =
+           r300VAPOutputCntl1(ctx, OutputsWritten);
+
+       rmesa->radeon.tcl.aos_count = nr;
+
+       return R300_FALLBACK_NONE;
+}
+
+void r300EmitCacheFlush(r300ContextPtr rmesa)
+{
+       BATCH_LOCALS(&rmesa->radeon);
+
+       BEGIN_BATCH_NO_AUTOSTATE(4);
+       OUT_BATCH_REGVAL(R300_RB3D_DSTCACHE_CTLSTAT,
+               R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
+               R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+       OUT_BATCH_REGVAL(R300_ZB_ZCACHE_CTLSTAT,
+               R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
+               R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+       END_BATCH();
+       COMMIT_BATCH();
+}
diff --git a/src/mesa/drivers/dri/r600/r600_emit.h b/src/mesa/drivers/dri/r600/r600_emit.h
new file mode 100644 (file)
index 0000000..14c17c0
--- /dev/null
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2005 Vladimir Dergachev.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/*
+ * Authors:
+ *   Vladimir Dergachev <volodya@mindspring.com>
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ *   Aapo Tahkola <aet@rasterburn.org>
+ *   Ben Skeggs <darktama@iinet.net.au>
+ *   Jerome Glisse <j.glisse@gmail.com>
+ */
+
+/* This files defines functions for accessing R300 hardware.
+ */
+#ifndef __R600_EMIT_H__
+#define __R600_EMIT_H__
+
+#include "main/glheader.h"
+#include "r600_context.h"
+#include "r600_cmdbuf.h"
+#include "radeon_reg.h"
+
+static INLINE uint32_t cmdpacket0(struct radeon_screen *rscrn,
+                                  int reg, int count)
+{
+    if (!rscrn->kernel_mm) {
+           drm_r300_cmd_header_t cmd;
+
+       cmd.u = 0;
+       cmd.packet0.cmd_type = R300_CMD_PACKET0;
+           cmd.packet0.count = count;
+       cmd.packet0.reghi = ((unsigned int)reg & 0xFF00) >> 8;
+           cmd.packet0.reglo = ((unsigned int)reg & 0x00FF);
+
+       return cmd.u;
+    }
+    if (count) {
+        return CP_PACKET0(reg, count - 1);
+    }
+    return CP_PACKET2;
+}
+
+static INLINE uint32_t cmdvpu(struct radeon_screen *rscrn, int addr, int count)
+{
+       drm_r300_cmd_header_t cmd;
+
+       cmd.u = 0;
+       cmd.vpu.cmd_type = R300_CMD_VPU;
+       cmd.vpu.count = count;
+       cmd.vpu.adrhi = ((unsigned int)addr & 0xFF00) >> 8;
+       cmd.vpu.adrlo = ((unsigned int)addr & 0x00FF);
+
+       return cmd.u;
+}
+
+static INLINE uint32_t cmdr500fp(struct radeon_screen *rscrn,
+                                 int addr, int count, int type, int clamp)
+{
+       drm_r300_cmd_header_t cmd;
+
+       cmd.u = 0;
+       cmd.r500fp.cmd_type = R300_CMD_R500FP;
+       cmd.r500fp.count = count;
+       cmd.r500fp.adrhi_flags = ((unsigned int)addr & 0x100) >> 8;
+       cmd.r500fp.adrhi_flags |= type ? R500FP_CONSTANT_TYPE : 0;
+       cmd.r500fp.adrhi_flags |= clamp ? R500FP_CONSTANT_CLAMP : 0;
+       cmd.r500fp.adrlo = ((unsigned int)addr & 0x00FF);
+
+       return cmd.u;
+}
+
+static INLINE uint32_t cmdpacket3(struct radeon_screen *rscrn, int packet)
+{
+       drm_r300_cmd_header_t cmd;
+
+       cmd.u = 0;
+       cmd.packet3.cmd_type = R300_CMD_PACKET3;
+       cmd.packet3.packet = packet;
+
+       return cmd.u;
+}
+
+static INLINE uint32_t cmdcpdelay(struct radeon_screen *rscrn,  
+                                  unsigned short count)
+{
+       drm_r300_cmd_header_t cmd;
+
+       cmd.u = 0;
+
+       cmd.delay.cmd_type = R300_CMD_CP_DELAY;
+       cmd.delay.count = count;
+
+       return cmd.u;
+}
+
+static INLINE uint32_t cmdwait(struct radeon_screen *rscrn,
+                               unsigned char flags)
+{
+       drm_r300_cmd_header_t cmd;
+
+       cmd.u = 0;
+       cmd.wait.cmd_type = R300_CMD_WAIT;
+       cmd.wait.flags = flags;
+
+       return cmd.u;
+}
+
+static INLINE uint32_t cmdpacify(struct radeon_screen *rscrn)
+{
+       drm_r300_cmd_header_t cmd;
+
+       cmd.u = 0;
+       cmd.header.cmd_type = R300_CMD_END3D;
+
+       return cmd.u;
+}
+
+/**
+ * Write the header of a packet3 to the command buffer.
+ * Outputs 2 dwords and expects (num_extra+1) additional dwords afterwards.
+ */
+#define OUT_BATCH_PACKET3(packet, num_extra) do {\
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {         \
+       OUT_BATCH(cmdpacket3(b_l_rmesa->radeonScreen,\
+                  R300_CMD_PACKET3_RAW)); \
+    } else b_l_rmesa->cmdbuf.cs->section_cdw++;\
+       OUT_BATCH(CP_PACKET3((packet), (num_extra))); \
+       } while(0)
+
+/**
+ * Must be sent to switch to 2d commands
+ */
+void static INLINE end_3d(radeonContextPtr radeon)
+{
+       BATCH_LOCALS(radeon);
+
+       if (!radeon->radeonScreen->kernel_mm) {
+               BEGIN_BATCH_NO_AUTOSTATE(1);
+               OUT_BATCH(cmdpacify(radeon->radeonScreen));
+               END_BATCH();
+       }
+}
+
+void static INLINE cp_delay(r300ContextPtr rmesa, unsigned short count)
+{
+       BATCH_LOCALS(&rmesa->radeon);
+
+       if (!rmesa->radeon.radeonScreen->kernel_mm) {
+               BEGIN_BATCH_NO_AUTOSTATE(1);
+               OUT_BATCH(cmdcpdelay(rmesa->radeon.radeonScreen, count));
+               END_BATCH();
+       }
+}
+
+void static INLINE cp_wait(radeonContextPtr radeon, unsigned char flags)
+{
+       BATCH_LOCALS(radeon);
+       uint32_t wait_until;
+
+       if (!radeon->radeonScreen->kernel_mm) {
+               BEGIN_BATCH_NO_AUTOSTATE(1);
+               OUT_BATCH(cmdwait(radeon->radeonScreen, flags));
+               END_BATCH();
+       } else {
+               switch(flags) {
+               case R300_WAIT_2D:
+                       wait_until = (1 << 14);
+                       break;
+               case R300_WAIT_3D:
+                       wait_until = (1 << 15);
+                       break;
+               case R300_NEW_WAIT_2D_3D:
+                       wait_until = (1 << 14) | (1 << 15);
+                       break;
+               case R300_NEW_WAIT_2D_2D_CLEAN:
+                       wait_until = (1 << 14) | (1 << 16) | (1 << 18);
+                       break;
+               case R300_NEW_WAIT_3D_3D_CLEAN:
+                       wait_until = (1 << 15) | (1 << 17) | (1 << 18);
+                       break;
+               case R300_NEW_WAIT_2D_2D_CLEAN_3D_3D_CLEAN:
+                       wait_until  = (1 << 14) | (1 << 16) | (1 << 18);
+                       wait_until |= (1 << 15) | (1 << 17) | (1 << 18);
+                       break;
+               default:
+                       return;
+               }
+               BEGIN_BATCH_NO_AUTOSTATE(2);
+               OUT_BATCH(CP_PACKET0(RADEON_WAIT_UNTIL, 0));
+               OUT_BATCH(wait_until);
+               END_BATCH();
+       }
+}
+
+extern int r300EmitArrays(GLcontext * ctx);
+
+extern int r300PrimitiveType(r300ContextPtr rmesa, int prim);
+extern int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim);
+
+extern void r300EmitCacheFlush(r300ContextPtr rmesa);
+
+extern GLuint r300VAPInputRoute0(uint32_t * dst, GLvector4f ** attribptr,
+                                int *inputs, GLint * tab, GLuint nr);
+extern GLuint r300VAPInputRoute1(uint32_t * dst, int swizzle[][4], GLuint nr);
+extern GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead);
+extern GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead);
+extern GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten);
+extern GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint OutputsWritten);
+
+#endif
diff --git a/src/mesa/drivers/dri/r600/r600_fragprog.c b/src/mesa/drivers/dri/r600/r600_fragprog.c
new file mode 100644 (file)
index 0000000..4d2dddc
--- /dev/null
@@ -0,0 +1,699 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * Fragment program compiler. Perform transformations on the intermediate
+ * representation until the program is in a form where we can translate
+ * it more or less directly into machine-readable form.
+ *
+ * \author Ben Skeggs <darktama@iinet.net.au>
+ * \author Jerome Glisse <j.glisse@gmail.com>
+ */
+
+#include "main/glheader.h"
+#include "main/macros.h"
+#include "main/enums.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+#include "r600_context.h"
+#include "r600_fragprog.h"
+#include "r600_fragprog_swizzle.h"
+#include "r600_state.h"
+
+#include "radeon_nqssadce.h"
+#include "radeon_program_alu.h"
+
+
+static void reset_srcreg(struct prog_src_register* reg)
+{
+       _mesa_bzero(reg, sizeof(*reg));
+       reg->Swizzle = SWIZZLE_NOOP;
+}
+
+static struct prog_src_register shadow_ambient(struct gl_program *program, int tmu)
+{
+       gl_state_index fail_value_tokens[STATE_LENGTH] = {
+               STATE_INTERNAL, STATE_SHADOW_AMBIENT, 0, 0, 0
+       };
+       struct prog_src_register reg = { 0, };
+
+       fail_value_tokens[2] = tmu;
+       reg.File = PROGRAM_STATE_VAR;
+       reg.Index = _mesa_add_state_reference(program->Parameters, fail_value_tokens);
+       reg.Swizzle = SWIZZLE_WWWW;
+       return reg;
+}
+
+/**
+ * Transform TEX, TXP, TXB, and KIL instructions in the following way:
+ *  - premultiply texture coordinates for RECT
+ *  - extract operand swizzles
+ *  - introduce a temporary register when write masks are needed
+ *
+ * \todo If/when r5xx uses the radeon_program architecture, this can probably
+ * be reused.
+ */
+static GLboolean transform_TEX(
+       struct radeon_transform_context *t,
+       struct prog_instruction* orig_inst, void* data)
+{
+       struct r300_fragment_program_compiler *compiler =
+               (struct r300_fragment_program_compiler*)data;
+       struct prog_instruction inst = *orig_inst;
+       struct prog_instruction* tgt;
+       GLboolean destredirect = GL_FALSE;
+
+       if (inst.Opcode != OPCODE_TEX &&
+           inst.Opcode != OPCODE_TXB &&
+           inst.Opcode != OPCODE_TXP &&
+           inst.Opcode != OPCODE_KIL)
+               return GL_FALSE;
+
+       if (inst.Opcode != OPCODE_KIL &&
+           t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) {
+               GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
+
+               if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {
+                       tgt = radeonAppendInstructions(t->Program, 1);
+
+                       tgt->Opcode = OPCODE_MOV;
+                       tgt->DstReg = inst.DstReg;
+                       if (comparefunc == GL_ALWAYS) {
+                               tgt->SrcReg[0].File = PROGRAM_BUILTIN;
+                               tgt->SrcReg[0].Swizzle = SWIZZLE_1111;
+                       } else {
+                               tgt->SrcReg[0] = shadow_ambient(t->Program, inst.TexSrcUnit);
+                       }
+                       return GL_TRUE;
+               }
+
+               inst.DstReg.File = PROGRAM_TEMPORARY;
+               inst.DstReg.Index = radeonFindFreeTemporary(t);
+               inst.DstReg.WriteMask = WRITEMASK_XYZW;
+       }
+
+
+       /* Hardware uses [0..1]x[0..1] range for rectangle textures
+        * instead of [0..Width]x[0..Height].
+        * Add a scaling instruction.
+        */
+       if (inst.Opcode != OPCODE_KIL && inst.TexSrcTarget == TEXTURE_RECT_INDEX) {
+               gl_state_index tokens[STATE_LENGTH] = {
+                       STATE_INTERNAL, STATE_R300_TEXRECT_FACTOR, 0, 0,
+                       0
+               };
+
+               int tempreg = radeonFindFreeTemporary(t);
+               int factor_index;
+
+               tokens[2] = inst.TexSrcUnit;
+               factor_index = _mesa_add_state_reference(t->Program->Parameters, tokens);
+
+               tgt = radeonAppendInstructions(t->Program, 1);
+
+               tgt->Opcode = OPCODE_MUL;
+               tgt->DstReg.File = PROGRAM_TEMPORARY;
+               tgt->DstReg.Index = tempreg;
+               tgt->SrcReg[0] = inst.SrcReg[0];
+               tgt->SrcReg[1].File = PROGRAM_STATE_VAR;
+               tgt->SrcReg[1].Index = factor_index;
+
+               reset_srcreg(&inst.SrcReg[0]);
+               inst.SrcReg[0].File = PROGRAM_TEMPORARY;
+               inst.SrcReg[0].Index = tempreg;
+       }
+
+       if (inst.Opcode != OPCODE_KIL) {
+               if (inst.DstReg.File != PROGRAM_TEMPORARY ||
+                   inst.DstReg.WriteMask != WRITEMASK_XYZW) {
+                       int tempreg = radeonFindFreeTemporary(t);
+
+                       inst.DstReg.File = PROGRAM_TEMPORARY;
+                       inst.DstReg.Index = tempreg;
+                       inst.DstReg.WriteMask = WRITEMASK_XYZW;
+                       destredirect = GL_TRUE;
+               }
+       }
+
+       if (inst.SrcReg[0].File != PROGRAM_TEMPORARY && inst.SrcReg[0].File != PROGRAM_INPUT) {
+               int tmpreg = radeonFindFreeTemporary(t);
+               tgt = radeonAppendInstructions(t->Program, 1);
+               tgt->Opcode = OPCODE_MOV;
+               tgt->DstReg.File = PROGRAM_TEMPORARY;
+               tgt->DstReg.Index = tmpreg;
+               tgt->SrcReg[0] = inst.SrcReg[0];
+
+               reset_srcreg(&inst.SrcReg[0]);
+               inst.SrcReg[0].File = PROGRAM_TEMPORARY;
+               inst.SrcReg[0].Index = tmpreg;
+       }
+       
+       tgt = radeonAppendInstructions(t->Program, 1);
+       _mesa_copy_instructions(tgt, &inst, 1);
+
+       if (inst.Opcode != OPCODE_KIL &&
+           t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) {
+               GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
+               GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode;
+               int rcptemp = radeonFindFreeTemporary(t);
+               int pass, fail;
+
+               tgt = radeonAppendInstructions(t->Program, 3);
+
+               tgt[0].Opcode = OPCODE_RCP;
+               tgt[0].DstReg.File = PROGRAM_TEMPORARY;
+               tgt[0].DstReg.Index = rcptemp;
+               tgt[0].DstReg.WriteMask = WRITEMASK_W;
+               tgt[0].SrcReg[0] = inst.SrcReg[0];
+               tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+
+               tgt[1].Opcode = OPCODE_MAD;
+               tgt[1].DstReg = inst.DstReg;
+               tgt[1].DstReg.WriteMask = orig_inst->DstReg.WriteMask;
+               tgt[1].SrcReg[0] = inst.SrcReg[0];
+               tgt[1].SrcReg[0].Swizzle = SWIZZLE_ZZZZ;
+               tgt[1].SrcReg[1].File = PROGRAM_TEMPORARY;
+               tgt[1].SrcReg[1].Index = rcptemp;
+               tgt[1].SrcReg[1].Swizzle = SWIZZLE_WWWW;
+               tgt[1].SrcReg[2].File = PROGRAM_TEMPORARY;
+               tgt[1].SrcReg[2].Index = inst.DstReg.Index;
+               if (depthmode == 0) /* GL_LUMINANCE */
+                       tgt[1].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
+               else if (depthmode == 2) /* GL_ALPHA */
+                       tgt[1].SrcReg[2].Swizzle = SWIZZLE_WWWW;
+
+               /* Recall that SrcReg[0] is tex, SrcReg[2] is r and:
+                *   r  < tex  <=>      -tex+r < 0
+                *   r >= tex  <=> not (-tex+r < 0 */
+               if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
+                       tgt[1].SrcReg[2].NegateBase = tgt[0].SrcReg[2].NegateBase ^ NEGATE_XYZW;
+               else
+                       tgt[1].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW;
+
+               tgt[2].Opcode = OPCODE_CMP;
+               tgt[2].DstReg = orig_inst->DstReg;
+               tgt[2].SrcReg[0].File = PROGRAM_TEMPORARY;
+               tgt[2].SrcReg[0].Index = tgt[1].DstReg.Index;
+
+               if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
+                       pass = 1;
+                       fail = 2;
+               } else {
+                       pass = 2;
+                       fail = 1;
+               }
+
+               tgt[2].SrcReg[pass].File = PROGRAM_BUILTIN;
+               tgt[2].SrcReg[pass].Swizzle = SWIZZLE_1111;
+               tgt[2].SrcReg[fail] = shadow_ambient(t->Program, inst.TexSrcUnit);
+       } else if (destredirect) {
+               tgt = radeonAppendInstructions(t->Program, 1);
+
+               tgt->Opcode = OPCODE_MOV;
+               tgt->DstReg = orig_inst->DstReg;
+               tgt->SrcReg[0].File = PROGRAM_TEMPORARY;
+               tgt->SrcReg[0].Index = inst.DstReg.Index;
+       }
+
+       return GL_TRUE;
+}
+
+
+static void update_params(r300ContextPtr r300, struct r300_fragment_program *fp)
+{
+       struct gl_fragment_program *mp = &fp->mesa_program;
+
+       /* Ask Mesa nicely to fill in ParameterValues for us */
+       if (mp->Base.Parameters)
+               _mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
+}
+
+
+/**
+ * Transform the program to support fragment.position.
+ *
+ * Introduce a small fragment at the start of the program that will be
+ * the only code that directly reads the FRAG_ATTRIB_WPOS input.
+ * All other code pieces that reference that input will be rewritten
+ * to read from a newly allocated temporary.
+ *
+ * \todo if/when r5xx supports the radeon_program architecture, this is a
+ * likely candidate for code sharing.
+ */
+static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler)
+{
+       GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
+
+       if (!(InputsRead & FRAG_BIT_WPOS))
+               return;
+
+       static gl_state_index tokens[STATE_LENGTH] = {
+               STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
+       };
+       struct prog_instruction *fpi;
+       GLuint window_index;
+       int i = 0;
+       GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
+
+       _mesa_insert_instructions(compiler->program, 0, 3);
+       fpi = compiler->program->Instructions;
+
+       /* perspective divide */
+       fpi[i].Opcode = OPCODE_RCP;
+
+       fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+       fpi[i].DstReg.Index = tempregi;
+       fpi[i].DstReg.WriteMask = WRITEMASK_W;
+       fpi[i].DstReg.CondMask = COND_TR;
+
+       fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+       fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+       fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+       i++;
+
+       fpi[i].Opcode = OPCODE_MUL;
+
+       fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+       fpi[i].DstReg.Index = tempregi;
+       fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+       fpi[i].DstReg.CondMask = COND_TR;
+
+       fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+       fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+       fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+       fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
+       fpi[i].SrcReg[1].Index = tempregi;
+       fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
+       i++;
+
+       /* viewport transformation */
+       window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
+
+       fpi[i].Opcode = OPCODE_MAD;
+
+       fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+       fpi[i].DstReg.Index = tempregi;
+       fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+       fpi[i].DstReg.CondMask = COND_TR;
+
+       fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+       fpi[i].SrcReg[0].Index = tempregi;
+       fpi[i].SrcReg[0].Swizzle =
+           MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+       fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
+       fpi[i].SrcReg[1].Index = window_index;
+       fpi[i].SrcReg[1].Swizzle =
+           MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+       fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
+       fpi[i].SrcReg[2].Index = window_index;
+       fpi[i].SrcReg[2].Swizzle =
+           MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+       i++;
+
+       for (; i < compiler->program->NumInstructions; ++i) {
+               int reg;
+               for (reg = 0; reg < 3; reg++) {
+                       if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
+                           fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
+                               fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
+                               fpi[i].SrcReg[reg].Index = tempregi;
+                       }
+               }
+       }
+}
+
+
+static void nqssadce_init(struct nqssadce_state* s)
+{
+       s->Outputs[FRAG_RESULT_COLOR].Sourced = WRITEMASK_XYZW;
+       s->Outputs[FRAG_RESULT_DEPTH].Sourced = WRITEMASK_W;
+}
+
+
+static GLuint build_dtm(GLuint depthmode)
+{
+       switch(depthmode) {
+       default:
+       case GL_LUMINANCE: return 0;
+       case GL_INTENSITY: return 1;
+       case GL_ALPHA: return 2;
+       }
+}
+
+static GLuint build_func(GLuint comparefunc)
+{
+       return comparefunc - GL_NEVER;
+}
+
+
+/**
+ * Collect all external state that is relevant for compiling the given
+ * fragment program.
+ */
+static void build_state(
+       r300ContextPtr r300,
+       struct r300_fragment_program *fp,
+       struct r300_fragment_program_external_state *state)
+{
+       int unit;
+
+       _mesa_bzero(state, sizeof(*state));
+
+       for(unit = 0; unit < 16; ++unit) {
+               if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
+                       struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
+
+                       state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
+                       state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
+               }
+       }
+}
+
+
+void r300TranslateFragmentShader(r300ContextPtr r300,
+                                struct r300_fragment_program *fp)
+{
+       struct r300_fragment_program_external_state state;
+
+       build_state(r300, fp, &state);
+       if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
+               /* TODO: cache compiled programs */
+               fp->translated = GL_FALSE;
+               _mesa_memcpy(&fp->state, &state, sizeof(state));
+       }
+
+       if (!fp->translated) {
+               struct r300_fragment_program_compiler compiler;
+
+               compiler.r300 = r300;
+               compiler.fp = fp;
+               compiler.code = &fp->code;
+               compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base);
+
+               if (RADEON_DEBUG & DEBUG_PIXEL) {
+                       _mesa_printf("Fragment Program: Initial program:\n");
+                       _mesa_print_program(compiler.program);
+               }
+
+               insert_WPOS_trailer(&compiler);
+
+               struct radeon_program_transformation transformations[] = {
+                       { &transform_TEX, &compiler },
+                       { &radeonTransformALU, 0 },
+                       { &radeonTransformTrigSimple, 0 }
+               };
+               radeonLocalTransform(
+                       r300->radeon.glCtx,
+                       compiler.program,
+                       3, transformations);
+
+               if (RADEON_DEBUG & DEBUG_PIXEL) {
+                       _mesa_printf("Fragment Program: After native rewrite:\n");
+                       _mesa_print_program(compiler.program);
+               }
+
+               struct radeon_nqssadce_descr nqssadce = {
+                       .Init = &nqssadce_init,
+                       .IsNativeSwizzle = &r300FPIsNativeSwizzle,
+                       .BuildSwizzle = &r300FPBuildSwizzle,
+                       .RewriteDepthOut = GL_TRUE
+               };
+               radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce);
+
+               if (RADEON_DEBUG & DEBUG_PIXEL) {
+                       _mesa_printf("Compiler: after NqSSA-DCE:\n");
+                       _mesa_print_program(compiler.program);
+               }
+
+               if (!r300FragmentProgramEmit(&compiler))
+                       fp->error = GL_TRUE;
+
+               /* Subtle: Rescue any parameters that have been added during transformations */
+               _mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
+               fp->mesa_program.Base.Parameters = compiler.program->Parameters;
+               compiler.program->Parameters = 0;
+
+               _mesa_reference_program(r300->radeon.glCtx, &compiler.program, NULL);
+
+               if (!fp->error)
+                       fp->translated = GL_TRUE;
+               if (fp->error || (RADEON_DEBUG & DEBUG_PIXEL))
+                       r300FragmentProgramDump(fp, &fp->code);
+               r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
+       }
+
+       update_params(r300, fp);
+}
+
+/* just some random things... */
+void r300FragmentProgramDump(
+       struct r300_fragment_program *fp,
+       struct r300_fragment_program_code *code)
+{
+       int n, i, j;
+       static int pc = 0;
+
+       fprintf(stderr, "pc=%d*************************************\n", pc++);
+
+       fprintf(stderr, "Hardware program\n");
+       fprintf(stderr, "----------------\n");
+
+       for (n = 0; n < (code->cur_node + 1); n++) {
+               fprintf(stderr, "NODE %d: alu_offset: %d, tex_offset: %d, "
+                       "alu_end: %d, tex_end: %d, flags: %08x\n", n,
+                       code->node[n].alu_offset,
+                       code->node[n].tex_offset,
+                       code->node[n].alu_end, code->node[n].tex_end,
+                       code->node[n].flags);
+
+               if (n > 0 || code->first_node_has_tex) {
+                       fprintf(stderr, "  TEX:\n");
+                       for (i = code->node[n].tex_offset;
+                            i <= code->node[n].tex_offset + code->node[n].tex_end;
+                            ++i) {
+                               const char *instr;
+
+                               switch ((code->tex.
+                                        inst[i] >> R300_TEX_INST_SHIFT) &
+                                       15) {
+                               case R300_TEX_OP_LD:
+                                       instr = "TEX";
+                                       break;
+                               case R300_TEX_OP_KIL:
+                                       instr = "KIL";
+                                       break;
+                               case R300_TEX_OP_TXP:
+                                       instr = "TXP";
+                                       break;
+                               case R300_TEX_OP_TXB:
+                                       instr = "TXB";
+                                       break;
+                               default:
+                                       instr = "UNKNOWN";
+                               }
+
+                               fprintf(stderr,
+                                       "    %s t%i, %c%i, texture[%i]   (%08x)\n",
+                                       instr,
+                                       (code->tex.
+                                        inst[i] >> R300_DST_ADDR_SHIFT) & 31,
+                                       't',
+                                       (code->tex.
+                                        inst[i] >> R300_SRC_ADDR_SHIFT) & 31,
+                                       (code->tex.
+                                        inst[i] & R300_TEX_ID_MASK) >>
+                                       R300_TEX_ID_SHIFT,
+                                       code->tex.inst[i]);
+                       }
+               }
+
+               for (i = code->node[n].alu_offset;
+                    i <= code->node[n].alu_offset + code->node[n].alu_end; ++i) {
+                       char srcc[3][10], dstc[20];
+                       char srca[3][10], dsta[20];
+                       char argc[3][20];
+                       char arga[3][20];
+                       char flags[5], tmp[10];
+
+                       for (j = 0; j < 3; ++j) {
+                               int regc = code->alu.inst[i].inst1 >> (j * 6);
+                               int rega = code->alu.inst[i].inst3 >> (j * 6);
+
+                               sprintf(srcc[j], "%c%i",
+                                       (regc & 32) ? 'c' : 't', regc & 31);
+                               sprintf(srca[j], "%c%i",
+                                       (rega & 32) ? 'c' : 't', rega & 31);
+                       }
+
+                       dstc[0] = 0;
+                       sprintf(flags, "%s%s%s",
+                               (code->alu.inst[i].
+                                inst1 & R300_ALU_DSTC_REG_X) ? "x" : "",
+                               (code->alu.inst[i].
+                                inst1 & R300_ALU_DSTC_REG_Y) ? "y" : "",
+                               (code->alu.inst[i].
+                                inst1 & R300_ALU_DSTC_REG_Z) ? "z" : "");
+                       if (flags[0] != 0) {
+                               sprintf(dstc, "t%i.%s ",
+                                       (code->alu.inst[i].
+                                        inst1 >> R300_ALU_DSTC_SHIFT) & 31,
+                                       flags);
+                       }
+                       sprintf(flags, "%s%s%s",
+                               (code->alu.inst[i].
+                                inst1 & R300_ALU_DSTC_OUTPUT_X) ? "x" : "",
+                               (code->alu.inst[i].
+                                inst1 & R300_ALU_DSTC_OUTPUT_Y) ? "y" : "",
+                               (code->alu.inst[i].
+                                inst1 & R300_ALU_DSTC_OUTPUT_Z) ? "z" : "");
+                       if (flags[0] != 0) {
+                               sprintf(tmp, "o%i.%s",
+                                       (code->alu.inst[i].
+                                        inst1 >> R300_ALU_DSTC_SHIFT) & 31,
+                                       flags);
+                               strcat(dstc, tmp);
+                       }
+
+                       dsta[0] = 0;
+                       if (code->alu.inst[i].inst3 & R300_ALU_DSTA_REG) {
+                               sprintf(dsta, "t%i.w ",
+                                       (code->alu.inst[i].
+                                        inst3 >> R300_ALU_DSTA_SHIFT) & 31);
+                       }
+                       if (code->alu.inst[i].inst3 & R300_ALU_DSTA_OUTPUT) {
+                               sprintf(tmp, "o%i.w ",
+                                       (code->alu.inst[i].
+                                        inst3 >> R300_ALU_DSTA_SHIFT) & 31);
+                               strcat(dsta, tmp);
+                       }
+                       if (code->alu.inst[i].inst3 & R300_ALU_DSTA_DEPTH) {
+                               strcat(dsta, "Z");
+                       }
+
+                       fprintf(stderr,
+                               "%3i: xyz: %3s %3s %3s -> %-20s (%08x)\n"
+                               "       w: %3s %3s %3s -> %-20s (%08x)\n", i,
+                               srcc[0], srcc[1], srcc[2], dstc,
+                               code->alu.inst[i].inst1, srca[0], srca[1],
+                               srca[2], dsta, code->alu.inst[i].inst3);
+
+                       for (j = 0; j < 3; ++j) {
+                               int regc = code->alu.inst[i].inst0 >> (j * 7);
+                               int rega = code->alu.inst[i].inst2 >> (j * 7);
+                               int d;
+                               char buf[20];
+
+                               d = regc & 31;
+                               if (d < 12) {
+                                       switch (d % 4) {
+                                       case R300_ALU_ARGC_SRC0C_XYZ:
+                                               sprintf(buf, "%s.xyz",
+                                                       srcc[d / 4]);
+                                               break;
+                                       case R300_ALU_ARGC_SRC0C_XXX:
+                                               sprintf(buf, "%s.xxx",
+                                                       srcc[d / 4]);
+                                               break;
+                                       case R300_ALU_ARGC_SRC0C_YYY:
+                                               sprintf(buf, "%s.yyy",
+                                                       srcc[d / 4]);
+                                               break;
+                                       case R300_ALU_ARGC_SRC0C_ZZZ:
+                                               sprintf(buf, "%s.zzz",
+                                                       srcc[d / 4]);
+                                               break;
+                                       }
+                               } else if (d < 15) {
+                                       sprintf(buf, "%s.www", srca[d - 12]);
+                               } else if (d == 20) {
+                                       sprintf(buf, "0.0");
+                               } else if (d == 21) {
+                                       sprintf(buf, "1.0");
+                               } else if (d == 22) {
+                                       sprintf(buf, "0.5");
+                               } else if (d >= 23 && d < 32) {
+                                       d -= 23;
+                                       switch (d / 3) {
+                                       case 0:
+                                               sprintf(buf, "%s.yzx",
+                                                       srcc[d % 3]);
+                                               break;
+                                       case 1:
+                                               sprintf(buf, "%s.zxy",
+                                                       srcc[d % 3]);
+                                               break;
+                                       case 2:
+                                               sprintf(buf, "%s.Wzy",
+                                                       srcc[d % 3]);
+                                               break;
+                                       }
+                               } else {
+                                       sprintf(buf, "%i", d);
+                               }
+
+                               sprintf(argc[j], "%s%s%s%s",
+                                       (regc & 32) ? "-" : "",
+                                       (regc & 64) ? "|" : "",
+                                       buf, (regc & 64) ? "|" : "");
+
+                               d = rega & 31;
+                               if (d < 9) {
+                                       sprintf(buf, "%s.%c", srcc[d / 3],
+                                               'x' + (char)(d % 3));
+                               } else if (d < 12) {
+                                       sprintf(buf, "%s.w", srca[d - 9]);
+                               } else if (d == 16) {
+                                       sprintf(buf, "0.0");
+                               } else if (d == 17) {
+                                       sprintf(buf, "1.0");
+                               } else if (d == 18) {
+                                       sprintf(buf, "0.5");
+                               } else {
+                                       sprintf(buf, "%i", d);
+                               }
+
+                               sprintf(arga[j], "%s%s%s%s",
+                                       (rega & 32) ? "-" : "",
+                                       (rega & 64) ? "|" : "",
+                                       buf, (rega & 64) ? "|" : "");
+                       }
+
+                       fprintf(stderr, "     xyz: %8s %8s %8s    op: %08x\n"
+                               "       w: %8s %8s %8s    op: %08x\n",
+                               argc[0], argc[1], argc[2],
+                               code->alu.inst[i].inst0, arga[0], arga[1],
+                               arga[2], code->alu.inst[i].inst2);
+               }
+       }
+}
diff --git a/src/mesa/drivers/dri/r600/r600_fragprog.h b/src/mesa/drivers/dri/r600/r600_fragprog.h
new file mode 100644 (file)
index 0000000..3813985
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/*
+ * Authors:
+ *   Ben Skeggs <darktama@iinet.net.au>
+ *   Jerome Glisse <j.glisse@gmail.com>
+ */
+#ifndef __R600_FRAGPROG_H_
+#define __R600_FRAGPROG_H_
+
+#include "main/glheader.h"
+#include "main/macros.h"
+#include "main/enums.h"
+#include "shader/program.h"
+#include "shader/prog_instruction.h"
+
+#include "r600_context.h"
+#include "radeon_program.h"
+
+#define DRI_CONF_FP_OPTIMIZATION_SPEED   0
+#define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
+
+#if 1
+
+/**
+ * Fragment program helper macros
+ */
+
+/* Produce unshifted source selectors */
+#define FP_TMP(idx) (idx)
+#define FP_CONST(idx) ((idx) | (1 << 5))
+
+/* Produce source/dest selector dword */
+#define FP_SELC_MASK_NO                0
+#define FP_SELC_MASK_X         1
+#define FP_SELC_MASK_Y         2
+#define FP_SELC_MASK_XY                3
+#define FP_SELC_MASK_Z         4
+#define FP_SELC_MASK_XZ                5
+#define FP_SELC_MASK_YZ                6
+#define FP_SELC_MASK_XYZ       7
+
+#define FP_SELC(destidx,regmask,outmask,src0,src1,src2) \
+       (((destidx) << R300_ALU_DSTC_SHIFT) |           \
+        (FP_SELC_MASK_##regmask << 23) |               \
+        (FP_SELC_MASK_##outmask << 26) |               \
+        ((src0) << R300_ALU_SRC0C_SHIFT) |             \
+        ((src1) << R300_ALU_SRC1C_SHIFT) |             \
+        ((src2) << R300_ALU_SRC2C_SHIFT))
+
+#define FP_SELA_MASK_NO                0
+#define FP_SELA_MASK_W         1
+
+#define FP_SELA(destidx,regmask,outmask,src0,src1,src2) \
+       (((destidx) << R300_ALU_DSTA_SHIFT) |           \
+        (FP_SELA_MASK_##regmask << 23) |               \
+        (FP_SELA_MASK_##outmask << 24) |               \
+        ((src0) << R300_ALU_SRC0A_SHIFT) |             \
+        ((src1) << R300_ALU_SRC1A_SHIFT) |             \
+        ((src2) << R300_ALU_SRC2A_SHIFT))
+
+/* Produce unshifted argument selectors */
+#define FP_ARGC(source)        R300_ALU_ARGC_##source
+#define FP_ARGA(source) R300_ALU_ARGA_##source
+#define FP_ABS(arg) ((arg) | (1 << 6))
+#define FP_NEG(arg) ((arg) ^ (1 << 5))
+
+/* Produce instruction dword */
+#define FP_INSTRC(opcode,arg0,arg1,arg2) \
+       (R300_ALU_OUTC_##opcode |               \
+       ((arg0) << R300_ALU_ARG0C_SHIFT) |      \
+       ((arg1) << R300_ALU_ARG1C_SHIFT) |      \
+       ((arg2) << R300_ALU_ARG2C_SHIFT))
+
+#define FP_INSTRA(opcode,arg0,arg1,arg2) \
+       (R300_ALU_OUTA_##opcode |               \
+       ((arg0) << R300_ALU_ARG0A_SHIFT) |      \
+       ((arg1) << R300_ALU_ARG1A_SHIFT) |      \
+       ((arg2) << R300_ALU_ARG2A_SHIFT))
+
+#endif
+
+struct r300_fragment_program;
+
+extern void r300TranslateFragmentShader(r300ContextPtr r300,
+                                       struct r300_fragment_program *fp);
+
+
+/**
+ * Used internally by the r300 fragment program code to store compile-time
+ * only data.
+ */
+struct r300_fragment_program_compiler {
+       r300ContextPtr r300;
+       struct r300_fragment_program *fp;
+       struct r300_fragment_program_code *code;
+       struct gl_program *program;
+};
+
+extern GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler);
+
+
+extern void r300FragmentProgramDump(
+       struct r300_fragment_program *fp,
+       struct r300_fragment_program_code *code);
+
+#endif
diff --git a/src/mesa/drivers/dri/r600/r600_fragprog_emit.c b/src/mesa/drivers/dri/r600/r600_fragprog_emit.c
new file mode 100644 (file)
index 0000000..f03559b
--- /dev/null
@@ -0,0 +1,344 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * Emit the r300_fragment_program_code that can be understood by the hardware.
+ * Input is a pre-transformed radeon_program.
+ *
+ * \author Ben Skeggs <darktama@iinet.net.au>
+ *
+ * \author Jerome Glisse <j.glisse@gmail.com>
+ *
+ * \todo FogOption
+ */
+
+#include "r600_fragprog.h"
+
+#include "radeon_program_pair.h"
+#include "r600_fragprog_swizzle.h"
+#include "r600_reg.h"
+
+
+#define PROG_CODE \
+       struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \
+       struct r300_fragment_program_code *code = c->code
+
+#define error(fmt, args...) do {                       \
+               fprintf(stderr, "%s::%s(): " fmt "\n",  \
+                       __FILE__, __FUNCTION__, ##args);        \
+       } while(0)
+
+
+static GLboolean emit_const(void* data, GLuint file, GLuint index, GLuint *hwindex)
+{
+       PROG_CODE;
+
+       for (*hwindex = 0; *hwindex < code->const_nr; ++*hwindex) {
+               if (code->constant[*hwindex].File == file &&
+                   code->constant[*hwindex].Index == index)
+                       break;
+       }
+
+       if (*hwindex >= code->const_nr) {
+               if (*hwindex >= PFS_NUM_CONST_REGS) {
+                       error("Out of hw constants!\n");
+                       return GL_FALSE;
+               }
+
+               code->const_nr++;
+               code->constant[*hwindex].File = file;
+               code->constant[*hwindex].Index = index;
+       }
+
+       return GL_TRUE;
+}
+
+
+/**
+ * Mark a temporary register as used.
+ */
+static void use_temporary(struct r300_fragment_program_code *code, GLuint index)
+{
+       if (index > code->max_temp_idx)
+               code->max_temp_idx = index;
+}
+
+
+static GLuint translate_rgb_opcode(GLuint opcode)
+{
+       switch(opcode) {
+       case OPCODE_CMP: return R300_ALU_OUTC_CMP;
+       case OPCODE_DP3: return R300_ALU_OUTC_DP3;
+       case OPCODE_DP4: return R300_ALU_OUTC_DP4;
+       case OPCODE_FRC: return R300_ALU_OUTC_FRC;
+       default:
+               error("translate_rgb_opcode(%i): Unknown opcode", opcode);
+               /* fall through */
+       case OPCODE_NOP:
+               /* fall through */
+       case OPCODE_MAD: return R300_ALU_OUTC_MAD;
+       case OPCODE_MAX: return R300_ALU_OUTC_MAX;
+       case OPCODE_MIN: return R300_ALU_OUTC_MIN;
+       case OPCODE_REPL_ALPHA: return R300_ALU_OUTC_REPL_ALPHA;
+       }
+}
+
+static GLuint translate_alpha_opcode(GLuint opcode)
+{
+       switch(opcode) {
+       case OPCODE_CMP: return R300_ALU_OUTA_CMP;
+       case OPCODE_DP3: return R300_ALU_OUTA_DP4;
+       case OPCODE_DP4: return R300_ALU_OUTA_DP4;
+       case OPCODE_EX2: return R300_ALU_OUTA_EX2;
+       case OPCODE_FRC: return R300_ALU_OUTA_FRC;
+       case OPCODE_LG2: return R300_ALU_OUTA_LG2;
+       default:
+               error("translate_rgb_opcode(%i): Unknown opcode", opcode);
+               /* fall through */
+       case OPCODE_NOP:
+               /* fall through */
+       case OPCODE_MAD: return R300_ALU_OUTA_MAD;
+       case OPCODE_MAX: return R300_ALU_OUTA_MAX;
+       case OPCODE_MIN: return R300_ALU_OUTA_MIN;
+       case OPCODE_RCP: return R300_ALU_OUTA_RCP;
+       case OPCODE_RSQ: return R300_ALU_OUTA_RSQ;
+       }
+}
+
+/**
+ * Emit one paired ALU instruction.
+ */
+static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
+{
+       PROG_CODE;
+
+       if (code->alu.length >= PFS_MAX_ALU_INST) {
+               error("Too many ALU instructions");
+               return GL_FALSE;
+       }
+
+       int ip = code->alu.length++;
+       int j;
+       code->node[code->cur_node].alu_end++;
+
+       code->alu.inst[ip].inst0 = translate_rgb_opcode(inst->RGB.Opcode);
+       code->alu.inst[ip].inst2 = translate_alpha_opcode(inst->Alpha.Opcode);
+
+       for(j = 0; j < 3; ++j) {
+               GLuint src = inst->RGB.Src[j].Index | (inst->RGB.Src[j].Constant << 5);
+               if (!inst->RGB.Src[j].Constant)
+                       use_temporary(code, inst->RGB.Src[j].Index);
+               code->alu.inst[ip].inst1 |= src << (6*j);
+
+               src = inst->Alpha.Src[j].Index | (inst->Alpha.Src[j].Constant << 5);
+               if (!inst->Alpha.Src[j].Constant)
+                       use_temporary(code, inst->Alpha.Src[j].Index);
+               code->alu.inst[ip].inst3 |= src << (6*j);
+
+               GLuint arg = r300FPTranslateRGBSwizzle(inst->RGB.Arg[j].Source, inst->RGB.Arg[j].Swizzle);
+               arg |= inst->RGB.Arg[j].Abs << 6;
+               arg |= inst->RGB.Arg[j].Negate << 5;
+               code->alu.inst[ip].inst0 |= arg << (7*j);
+
+               arg = r300FPTranslateAlphaSwizzle(inst->Alpha.Arg[j].Source, inst->Alpha.Arg[j].Swizzle);
+               arg |= inst->Alpha.Arg[j].Abs << 6;
+               arg |= inst->Alpha.Arg[j].Negate << 5;
+               code->alu.inst[ip].inst2 |= arg << (7*j);
+       }
+
+       if (inst->RGB.Saturate)
+               code->alu.inst[ip].inst0 |= R300_ALU_OUTC_CLAMP;
+       if (inst->Alpha.Saturate)
+               code->alu.inst[ip].inst2 |= R300_ALU_OUTA_CLAMP;
+
+       if (inst->RGB.WriteMask) {
+               use_temporary(code, inst->RGB.DestIndex);
+               code->alu.inst[ip].inst1 |=
+                       (inst->RGB.DestIndex << R300_ALU_DSTC_SHIFT) |
+                       (inst->RGB.WriteMask << R300_ALU_DSTC_REG_MASK_SHIFT);
+       }
+       if (inst->RGB.OutputWriteMask) {
+               code->alu.inst[ip].inst1 |= (inst->RGB.OutputWriteMask << R300_ALU_DSTC_OUTPUT_MASK_SHIFT);
+               code->node[code->cur_node].flags |= R300_RGBA_OUT;
+       }
+
+       if (inst->Alpha.WriteMask) {
+               use_temporary(code, inst->Alpha.DestIndex);
+               code->alu.inst[ip].inst3 |=
+                       (inst->Alpha.DestIndex << R300_ALU_DSTA_SHIFT) |
+                       R300_ALU_DSTA_REG;
+       }
+       if (inst->Alpha.OutputWriteMask) {
+               code->alu.inst[ip].inst3 |= R300_ALU_DSTA_OUTPUT;
+               code->node[code->cur_node].flags |= R300_RGBA_OUT;
+       }
+       if (inst->Alpha.DepthWriteMask) {
+               code->alu.inst[ip].inst3 |= R300_ALU_DSTA_DEPTH;
+               code->node[code->cur_node].flags |= R300_W_OUT;
+               c->fp->WritesDepth = GL_TRUE;
+       }
+
+       return GL_TRUE;
+}
+
+
+/**
+ * Finish the current node without advancing to the next one.
+ */
+static GLboolean finish_node(struct r300_fragment_program_compiler *c)
+{
+       struct r300_fragment_program_code *code = c->code;
+       struct r300_fragment_program_node *node = &code->node[code->cur_node];
+
+       if (node->alu_end < 0) {
+               /* Generate a single NOP for this node */
+               struct radeon_pair_instruction inst;
+               _mesa_bzero(&inst, sizeof(inst));
+               if (!emit_alu(c, &inst))
+                       return GL_FALSE;
+       }
+
+       if (node->tex_end < 0) {
+               if (code->cur_node == 0) {
+                       node->tex_end = 0;
+               } else {
+                       error("Node %i has no TEX instructions", code->cur_node);
+                       return GL_FALSE;
+               }
+       } else {
+               if (code->cur_node == 0)
+                       code->first_node_has_tex = 1;
+       }
+
+       return GL_TRUE;
+}
+
+
+/**
+ * Begin a block of texture instructions.
+ * Create the necessary indirection.
+ */
+static GLboolean begin_tex(void* data)
+{
+       PROG_CODE;
+
+       if (code->cur_node == 0) {
+               if (code->node[0].alu_end < 0 &&
+                   code->node[0].tex_end < 0)
+                       return GL_TRUE;
+       }
+
+       if (code->cur_node == 3) {
+               error("Too many texture indirections");
+               return GL_FALSE;
+       }
+
+       if (!finish_node(c))
+               return GL_FALSE;
+
+       struct r300_fragment_program_node *node = &code->node[++code->cur_node];
+       node->alu_offset = code->alu.length;
+       node->alu_end = -1;
+       node->tex_offset = code->tex.length;
+       node->tex_end = -1;
+       return GL_TRUE;
+}
+
+
+static GLboolean emit_tex(void* data, struct prog_instruction* inst)
+{
+       PROG_CODE;
+
+       if (code->tex.length >= PFS_MAX_TEX_INST) {
+               error("Too many TEX instructions");
+               return GL_FALSE;
+       }
+
+       GLuint unit = inst->TexSrcUnit;
+       GLuint dest = inst->DstReg.Index;
+       GLuint opcode;
+
+       switch(inst->Opcode) {
+       case OPCODE_KIL: opcode = R300_TEX_OP_KIL; break;
+       case OPCODE_TEX: opcode = R300_TEX_OP_LD; break;
+       case OPCODE_TXB: opcode = R300_TEX_OP_TXB; break;
+       case OPCODE_TXP: opcode = R300_TEX_OP_TXP; break;
+       default:
+               error("Unknown texture opcode %i", inst->Opcode);
+               return GL_FALSE;
+       }
+
+       if (inst->Opcode == OPCODE_KIL) {
+               unit = 0;
+               dest = 0;
+       } else {
+               use_temporary(code, dest);
+       }
+
+       use_temporary(code, inst->SrcReg[0].Index);
+
+       code->node[code->cur_node].tex_end++;
+       code->tex.inst[code->tex.length++] =
+               (inst->SrcReg[0].Index << R300_SRC_ADDR_SHIFT) |
+               (dest << R300_DST_ADDR_SHIFT) |
+               (unit << R300_TEX_ID_SHIFT) |
+               (opcode << R300_TEX_INST_SHIFT);
+       return GL_TRUE;
+}
+
+
+static const struct radeon_pair_handler pair_handler = {
+       .EmitConst = &emit_const,
+       .EmitPaired = &emit_alu,
+       .EmitTex = &emit_tex,
+       .BeginTexBlock = &begin_tex,
+       .MaxHwTemps = PFS_NUM_TEMP_REGS
+};
+
+/**
+ * Final compilation step: Turn the intermediate radeon_program into
+ * machine-readable instructions.
+ */
+GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
+{
+       struct r300_fragment_program_code *code = compiler->code;
+
+       _mesa_bzero(code, sizeof(struct r300_fragment_program_code));
+       code->node[0].alu_end = -1;
+       code->node[0].tex_end = -1;
+
+       if (!radeonPairProgram(compiler->r300->radeon.glCtx, compiler->program, &pair_handler, compiler))
+               return GL_FALSE;
+
+       if (!finish_node(compiler))
+               return GL_FALSE;
+
+       return GL_TRUE;
+}
+
diff --git a/src/mesa/drivers/dri/r600/r600_fragprog_swizzle.c b/src/mesa/drivers/dri/r600/r600_fragprog_swizzle.c
new file mode 100644 (file)
index 0000000..16c5fc8
--- /dev/null
@@ -0,0 +1,227 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * @file
+ * Utilities to deal with the somewhat odd restriction on R300 fragment
+ * program swizzles.
+ */
+
+#include "r600_fragprog_swizzle.h"
+
+#include "r600_reg.h"
+#include "radeon_nqssadce.h"
+
+#define MAKE_SWZ3(x, y, z) (MAKE_SWIZZLE4(SWIZZLE_##x, SWIZZLE_##y, SWIZZLE_##z, SWIZZLE_ZERO))
+
+struct swizzle_data {
+       GLuint hash; /**< swizzle value this matches */
+       GLuint base; /**< base value for hw swizzle */
+       GLuint stride; /**< difference in base between arg0/1/2 */
+};
+
+static const struct swizzle_data native_swizzles[] = {
+       {MAKE_SWZ3(X, Y, Z), R300_ALU_ARGC_SRC0C_XYZ, 4},
+       {MAKE_SWZ3(X, X, X), R300_ALU_ARGC_SRC0C_XXX, 4},
+       {MAKE_SWZ3(Y, Y, Y), R300_ALU_ARGC_SRC0C_YYY, 4},
+       {MAKE_SWZ3(Z, Z, Z), R300_ALU_ARGC_SRC0C_ZZZ, 4},
+       {MAKE_SWZ3(W, W, W), R300_ALU_ARGC_SRC0A, 1},
+       {MAKE_SWZ3(Y, Z, X), R300_ALU_ARGC_SRC0C_YZX, 1},
+       {MAKE_SWZ3(Z, X, Y), R300_ALU_ARGC_SRC0C_ZXY, 1},
+       {MAKE_SWZ3(W, Z, Y), R300_ALU_ARGC_SRC0CA_WZY, 1},
+       {MAKE_SWZ3(ONE, ONE, ONE), R300_ALU_ARGC_ONE, 0},
+       {MAKE_SWZ3(ZERO, ZERO, ZERO), R300_ALU_ARGC_ZERO, 0}
+};
+
+static const int num_native_swizzles = sizeof(native_swizzles)/sizeof(native_swizzles[0]);
+
+
+/**
+ * Find a native RGB swizzle that matches the given swizzle.
+ * Returns 0 if none found.
+ */
+static const struct swizzle_data* lookup_native_swizzle(GLuint swizzle)
+{
+       int i, comp;
+
+       for(i = 0; i < num_native_swizzles; ++i) {
+               const struct swizzle_data* sd = &native_swizzles[i];
+               for(comp = 0; comp < 3; ++comp) {
+                       GLuint swz = GET_SWZ(swizzle, comp);
+                       if (swz == SWIZZLE_NIL)
+                               continue;
+                       if (swz != GET_SWZ(sd->hash, comp))
+                               break;
+               }
+               if (comp == 3)
+                       return sd;
+       }
+
+       return 0;
+}
+
+
+/**
+ * Check whether the given instruction supports the swizzle and negate
+ * combinations in the given source register.
+ */
+GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
+{
+       if (reg.Abs)
+               reg.NegateBase = 0;
+
+       if (opcode == OPCODE_KIL ||
+           opcode == OPCODE_TEX ||
+           opcode == OPCODE_TXB ||
+           opcode == OPCODE_TXP) {
+               int j;
+
+               if (reg.Abs || reg.NegateBase != (15*reg.NegateAbs))
+                       return GL_FALSE;
+
+               for(j = 0; j < 4; ++j) {
+                       GLuint swz = GET_SWZ(reg.Swizzle, j);
+                       if (swz == SWIZZLE_NIL)
+                               continue;
+                       if (swz != j)
+                               return GL_FALSE;
+               }
+
+               return GL_TRUE;
+       }
+
+       GLuint relevant = 0;
+       int j;
+
+       for(j = 0; j < 3; ++j)
+               if (GET_SWZ(reg.Swizzle, j) != SWIZZLE_NIL)
+                       relevant |= 1 << j;
+
+       if ((reg.NegateBase & relevant) && (reg.NegateBase & relevant) != relevant)
+               return GL_FALSE;
+
+       if (!lookup_native_swizzle(reg.Swizzle))
+               return GL_FALSE;
+
+       return GL_TRUE;
+}
+
+
+/**
+ * Generate MOV dst, src using only native swizzles.
+ */
+void r300FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src)
+{
+       if (src.Abs)
+               src.NegateBase = 0;
+
+       while(dst.WriteMask) {
+               const struct swizzle_data *best_swizzle = 0;
+               GLuint best_matchcount = 0;
+               GLuint best_matchmask = 0;
+               GLboolean rgbnegate;
+               int i, comp;
+
+               for(i = 0; i < num_native_swizzles; ++i) {
+                       const struct swizzle_data *sd = &native_swizzles[i];
+                       GLuint matchcount = 0;
+                       GLuint matchmask = 0;
+                       for(comp = 0; comp < 3; ++comp) {
+                               if (!GET_BIT(dst.WriteMask, comp))
+                                       continue;
+                               GLuint swz = GET_SWZ(src.Swizzle, comp);
+                               if (swz == SWIZZLE_NIL)
+                                       continue;
+                               if (swz == GET_SWZ(sd->hash, comp)) {
+                                       matchcount++;
+                                       matchmask |= 1 << comp;
+                               }
+                       }
+                       if (matchcount > best_matchcount) {
+                               best_swizzle = sd;
+                               best_matchcount = matchcount;
+                               best_matchmask = matchmask;
+                               if (matchmask == (dst.WriteMask & WRITEMASK_XYZ))
+                                       break;
+                       }
+               }
+
+               if ((src.NegateBase & best_matchmask) != 0) {
+                       best_matchmask &= src.NegateBase;
+                       rgbnegate = !src.NegateAbs;
+               } else {
+                       rgbnegate = src.NegateAbs;
+               }
+
+               struct prog_instruction *inst;
+
+               _mesa_insert_instructions(s->Program, s->IP, 1);
+               inst = s->Program->Instructions + s->IP++;
+               inst->Opcode = OPCODE_MOV;
+               inst->DstReg = dst;
+               inst->DstReg.WriteMask &= (best_matchmask | WRITEMASK_W);
+               inst->SrcReg[0] = src;
+               /* Note: We rely on NqSSA/DCE to set unused swizzle components to NIL */
+
+               dst.WriteMask &= ~inst->DstReg.WriteMask;
+       }
+}
+
+
+/**
+ * Translate an RGB (XYZ) swizzle into the hardware code for the given
+ * instruction source.
+ */
+GLuint r300FPTranslateRGBSwizzle(GLuint src, GLuint swizzle)
+{
+       const struct swizzle_data* sd = lookup_native_swizzle(swizzle);
+
+       if (!sd) {
+               _mesa_printf("Not a native swizzle: %08x\n", swizzle);
+               return 0;
+       }
+
+       return sd->base + src*sd->stride;
+}
+
+
+/**
+ * Translate an Alpha (W) swizzle into the hardware code for the given
+ * instruction source.
+ */
+GLuint r300FPTranslateAlphaSwizzle(GLuint src, GLuint swizzle)
+{
+       if (swizzle < 3)
+               return swizzle + 3*src;
+
+       switch(swizzle) {
+       case SWIZZLE_W: return R300_ALU_ARGA_SRC0A + src;
+       case SWIZZLE_ONE: return R300_ALU_ARGA_ONE;
+       case SWIZZLE_ZERO: return R300_ALU_ARGA_ZERO;
+       default: return R300_ALU_ARGA_ONE;
+       }
+}
diff --git a/src/mesa/drivers/dri/r600/r600_fragprog_swizzle.h b/src/mesa/drivers/dri/r600/r600_fragprog_swizzle.h
new file mode 100644 (file)
index 0000000..4a0f8cc
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __R600_FRAGPROG_SWIZZLE_H_
+#define __R600_FRAGPROG_SWIZZLE_H_
+
+#include "main/glheader.h"
+#include "shader/prog_instruction.h"
+
+struct nqssadce_state;
+
+GLboolean r300FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg);
+void r300FPBuildSwizzle(struct nqssadce_state*, struct prog_dst_register dst, struct prog_src_register src);
+
+GLuint r300FPTranslateRGBSwizzle(GLuint src, GLuint swizzle);
+GLuint r300FPTranslateAlphaSwizzle(GLuint src, GLuint swizzle);
+
+#endif /* __R300_FRAGPROG_SWIZZLE_H_ */
diff --git a/src/mesa/drivers/dri/r600/r600_ioctl.c b/src/mesa/drivers/dri/r600/r600_ioctl.c
new file mode 100644 (file)
index 0000000..c75354a
--- /dev/null
@@ -0,0 +1,667 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.
+Copyright (C) 2004 Nicolai Haehnle.
+All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include <sched.h>
+#include <errno.h>
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "swrast/swrast.h"
+
+#include "radeon_common.h"
+#include "radeon_lock.h"
+#include "r600_context.h"
+#include "r600_ioctl.h"
+#include "r600_cmdbuf.h"
+#include "r600_state.h"
+#include "r600_vertprog.h"
+#include "radeon_reg.h"
+#include "r600_emit.h"
+#include "r600_fragprog.h"
+#include "r600_context.h"
+
+#include "vblank.h"
+
+#define R200_3D_DRAW_IMMD_2      0xC0003500
+
+#define CLEARBUFFER_COLOR      0x1
+#define CLEARBUFFER_DEPTH      0x2
+#define CLEARBUFFER_STENCIL    0x4
+
+static void r300EmitClearState(GLcontext * ctx);
+
+static void r300UserClear(GLcontext *ctx, GLuint mask)
+{
+       radeon_clear_tris(ctx, mask);
+}
+
+static void r300ClearBuffer(r300ContextPtr r300, int flags,
+                           struct radeon_renderbuffer *rrb,
+                           struct radeon_renderbuffer *rrbd)
+{
+       BATCH_LOCALS(&r300->radeon);
+       GLcontext *ctx = r300->radeon.glCtx;
+       __DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+       GLuint cbpitch = 0;
+       r300ContextPtr rmesa = r300;
+
+       if (RADEON_DEBUG & DEBUG_IOCTL)
+               fprintf(stderr, "%s: buffer %p (%i,%i %ix%i)\n",
+                       __FUNCTION__, rrb, dPriv->x, dPriv->y,
+                       dPriv->w, dPriv->h);
+
+       if (rrb) {
+               cbpitch = (rrb->pitch / rrb->cpp);
+               if (rrb->cpp == 4)
+                       cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+               else
+                       cbpitch |= R300_COLOR_FORMAT_RGB565;
+
+               if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
+                       cbpitch |= R300_COLOR_TILE_ENABLE;
+        }
+       }
+
+       /* TODO in bufmgr */
+       cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+       end_3d(&rmesa->radeon);
+
+       if (flags & CLEARBUFFER_COLOR) {
+               assert(rrb != 0);
+               BEGIN_BATCH_NO_AUTOSTATE(6);
+               OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
+               OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+               OUT_BATCH_REGVAL(R300_RB3D_COLORPITCH0, cbpitch);
+               END_BATCH();
+       }
+#if 1
+       if (flags & (CLEARBUFFER_DEPTH | CLEARBUFFER_STENCIL)) {
+               assert(rrbd != 0);
+               cbpitch = (rrbd->pitch / rrbd->cpp);
+               if (rrbd->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
+                       cbpitch |= R300_DEPTHMACROTILE_ENABLE;
+        }
+               if (rrbd->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
+            cbpitch |= R300_DEPTHMICROTILE_TILED;
+        }
+               BEGIN_BATCH_NO_AUTOSTATE(6);
+               OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
+               OUT_BATCH_RELOC(0, rrbd->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+               OUT_BATCH_REGVAL(R300_ZB_DEPTHPITCH, cbpitch);
+               END_BATCH();
+       }
+#endif
+       BEGIN_BATCH_NO_AUTOSTATE(6);
+       OUT_BATCH_REGSEQ(RB3D_COLOR_CHANNEL_MASK, 1);
+       if (flags & CLEARBUFFER_COLOR) {
+               OUT_BATCH((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
+                         (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
+                         (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
+                         (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
+       } else {
+               OUT_BATCH(0);
+       }
+
+
+       {
+               uint32_t t1, t2;
+
+               t1 = 0x0;
+               t2 = 0x0;
+
+               if (flags & CLEARBUFFER_DEPTH) {
+                       t1 |= R300_Z_ENABLE | R300_Z_WRITE_ENABLE;
+                       t2 |=
+                           (R300_ZS_ALWAYS << R300_Z_FUNC_SHIFT);
+               }
+
+               if (flags & CLEARBUFFER_STENCIL) {
+                       t1 |= R300_STENCIL_ENABLE;
+                       t2 |=
+                           (R300_ZS_ALWAYS <<
+                            R300_S_FRONT_FUNC_SHIFT) |
+                           (R300_ZS_REPLACE <<
+                            R300_S_FRONT_SFAIL_OP_SHIFT) |
+                           (R300_ZS_REPLACE <<
+                            R300_S_FRONT_ZPASS_OP_SHIFT) |
+                           (R300_ZS_REPLACE <<
+                            R300_S_FRONT_ZFAIL_OP_SHIFT);
+               }
+
+               OUT_BATCH_REGSEQ(R300_ZB_CNTL, 3);
+               OUT_BATCH(t1);
+               OUT_BATCH(t2);
+               OUT_BATCH(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) <<
+                   R300_STENCILWRITEMASK_SHIFT) |
+                         (ctx->Stencil.Clear & R300_STENCILREF_MASK));
+               END_BATCH();
+       }
+
+       if (!rmesa->radeon.radeonScreen->kernel_mm) {
+               BEGIN_BATCH_NO_AUTOSTATE(9);
+               OUT_BATCH(cmdpacket3(r300->radeon.radeonScreen, R300_CMD_PACKET3_CLEAR));
+               OUT_BATCH_FLOAT32(dPriv->w / 2.0);
+               OUT_BATCH_FLOAT32(dPriv->h / 2.0);
+               OUT_BATCH_FLOAT32(ctx->Depth.Clear);
+               OUT_BATCH_FLOAT32(1.0);
+               OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
+               OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
+               OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
+               OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
+               END_BATCH();
+       } else {
+               OUT_BATCH(CP_PACKET3(R200_3D_DRAW_IMMD_2, 8));
+               OUT_BATCH(R300_PRIM_TYPE_POINT | R300_PRIM_WALK_RING |
+                         (1 << R300_PRIM_NUM_VERTICES_SHIFT));
+               OUT_BATCH_FLOAT32(dPriv->w / 2.0);
+               OUT_BATCH_FLOAT32(dPriv->h / 2.0);
+               OUT_BATCH_FLOAT32(ctx->Depth.Clear);
+               OUT_BATCH_FLOAT32(1.0);
+               OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
+               OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
+               OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
+               OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
+       }
+       
+       r300EmitCacheFlush(rmesa);
+       cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+
+       R300_STATECHANGE(r300, cb);
+       R300_STATECHANGE(r300, cmk);
+       R300_STATECHANGE(r300, zs);
+}
+
+static void r300EmitClearState(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       BATCH_LOCALS(&r300->radeon);
+       __DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+       int i;
+       int has_tcl = 1;
+       int is_r500 = 0;
+       GLuint vap_cntl;
+
+       if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+               has_tcl = 0;
+
+       if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+               is_r500 = 1;
+
+       /* State atom dirty tracking is a little subtle here.
+        *
+        * On the one hand, we need to make sure base state is emitted
+        * here if we start with an empty batch buffer, otherwise clear
+        * works incorrectly with multiple processes. Therefore, the first
+        * BEGIN_BATCH cannot be a BEGIN_BATCH_NO_AUTOSTATE.
+        *
+        * On the other hand, implicit state emission clears the state atom
+        * dirty bits, so we have to call R300_STATECHANGE later than the
+        * first BEGIN_BATCH.
+        *
+        * The final trickiness is that, because we change state, we need
+        * to ensure that any stored swtcl primitives are flushed properly
+        * before we start changing state. See the R300_NEWPRIM in r300Clear
+        * for this.
+        */
+       BEGIN_BATCH(31);
+       OUT_BATCH_REGSEQ(R300_VAP_PROG_STREAM_CNTL_0, 1);
+       if (!has_tcl)
+               OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+                ((R300_LAST_VEC | (2 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
+       else
+               OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+                ((R300_LAST_VEC | (1 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
+
+       OUT_BATCH_REGVAL(R300_FG_FOG_BLEND, 0);
+       OUT_BATCH_REGVAL(R300_VAP_PROG_STREAM_CNTL_EXT_0,
+          ((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
+              (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) |
+              (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) |
+              (R300_SWIZZLE_SELECT_W << R300_SWIZZLE_SELECT_W_SHIFT) |
+              ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y | R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT))
+             << R300_SWIZZLE0_SHIFT) |
+            (((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
+              (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) |
+              (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) |
+              (R300_SWIZZLE_SELECT_W << R300_SWIZZLE_SELECT_W_SHIFT) |
+              ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y | R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT))
+             << R300_SWIZZLE1_SHIFT)));
+
+       /* R300_VAP_INPUT_CNTL_0, R300_VAP_INPUT_CNTL_1 */
+       OUT_BATCH_REGSEQ(R300_VAP_VTX_STATE_CNTL, 2);
+       OUT_BATCH((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
+       OUT_BATCH(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
+
+       /* comes from fglrx startup of clear */
+       OUT_BATCH_REGSEQ(R300_SE_VTE_CNTL, 2);
+       OUT_BATCH(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
+                 R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
+                 R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
+                 R300_VPORT_Z_OFFSET_ENA);
+       OUT_BATCH(0x8);
+
+       OUT_BATCH_REGVAL(R300_VAP_PSC_SGN_NORM_CNTL, 0xaaaaaaaa);
+
+       OUT_BATCH_REGSEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+       OUT_BATCH(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
+                 R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
+       OUT_BATCH(0); /* no textures */
+
+       OUT_BATCH_REGVAL(R300_TX_ENABLE, 0);
+
+       OUT_BATCH_REGSEQ(R300_SE_VPORT_XSCALE, 6);
+       OUT_BATCH_FLOAT32(1.0);
+       OUT_BATCH_FLOAT32(dPriv->x);
+       OUT_BATCH_FLOAT32(1.0);
+       OUT_BATCH_FLOAT32(dPriv->y);
+       OUT_BATCH_FLOAT32(1.0);
+       OUT_BATCH_FLOAT32(0.0);
+
+       OUT_BATCH_REGVAL(R300_FG_ALPHA_FUNC, 0);
+
+       OUT_BATCH_REGSEQ(R300_RB3D_CBLEND, 2);
+       OUT_BATCH(0x0);
+       OUT_BATCH(0x0);
+       END_BATCH();
+
+       R300_STATECHANGE(r300, vir[0]);
+       R300_STATECHANGE(r300, fogs);
+       R300_STATECHANGE(r300, vir[1]);
+       R300_STATECHANGE(r300, vic);
+       R300_STATECHANGE(r300, vte);
+       R300_STATECHANGE(r300, vof);
+       R300_STATECHANGE(r300, txe);
+       R300_STATECHANGE(r300, vpt);
+       R300_STATECHANGE(r300, at);
+       R300_STATECHANGE(r300, bld);
+       R300_STATECHANGE(r300, ps);
+
+       if (has_tcl) {
+               R300_STATECHANGE(r300, vap_clip_cntl);
+
+               BEGIN_BATCH_NO_AUTOSTATE(2);
+               OUT_BATCH_REGVAL(R300_VAP_CLIP_CNTL, R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
+               END_BATCH();
+        }
+
+       BEGIN_BATCH_NO_AUTOSTATE(2);
+       OUT_BATCH_REGVAL(R300_GA_POINT_SIZE,
+               ((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
+               ((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
+       END_BATCH();
+
+       if (!is_r500) {
+               R300_STATECHANGE(r300, ri);
+               R300_STATECHANGE(r300, rc);
+               R300_STATECHANGE(r300, rr);
+
+               BEGIN_BATCH(14);
+               OUT_BATCH_REGSEQ(R300_RS_IP_0, 8);
+               for (i = 0; i < 8; ++i)
+                       OUT_BATCH(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
+
+               OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+               OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+               OUT_BATCH(0x0);
+
+               OUT_BATCH_REGVAL(R300_RS_INST_0, R300_RS_INST_COL_CN_WRITE);
+               END_BATCH();
+       } else {
+               R300_STATECHANGE(r300, ri);
+               R300_STATECHANGE(r300, rc);
+               R300_STATECHANGE(r300, rr);
+
+               BEGIN_BATCH(14);
+               OUT_BATCH_REGSEQ(R500_RS_IP_0, 8);
+               for (i = 0; i < 8; ++i) {
+                       OUT_BATCH((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
+                                 (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
+                                 (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
+                                 (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
+               }
+
+               OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+               OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+               OUT_BATCH(0x0);
+
+               OUT_BATCH_REGVAL(R500_RS_INST_0, R500_RS_INST_COL_CN_WRITE);
+               END_BATCH();
+       }
+
+       if (!is_r500) {
+               R300_STATECHANGE(r300, fp);
+               R300_STATECHANGE(r300, fpi[0]);
+               R300_STATECHANGE(r300, fpi[1]);
+               R300_STATECHANGE(r300, fpi[2]);
+               R300_STATECHANGE(r300, fpi[3]);
+
+               BEGIN_BATCH(17);
+               OUT_BATCH_REGSEQ(R300_US_CONFIG, 3);
+               OUT_BATCH(0x0);
+               OUT_BATCH(0x0);
+               OUT_BATCH(0x0);
+               OUT_BATCH_REGSEQ(R300_US_CODE_ADDR_0, 4);
+               OUT_BATCH(0x0);
+               OUT_BATCH(0x0);
+               OUT_BATCH(0x0);
+               OUT_BATCH(R300_RGBA_OUT);
+
+               OUT_BATCH_REGVAL(R300_US_ALU_RGB_INST_0,
+                       FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
+               OUT_BATCH_REGVAL(R300_US_ALU_RGB_ADDR_0,
+                       FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
+               OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_INST_0,
+                       FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
+               OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_ADDR_0,
+                       FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
+               END_BATCH();
+       } else {
+               struct radeon_state_atom r500fp;
+               uint32_t _cmd[10];
+
+               R300_STATECHANGE(r300, fp);
+               R300_STATECHANGE(r300, r500fp);
+
+               BEGIN_BATCH(7);
+               OUT_BATCH_REGSEQ(R500_US_CONFIG, 2);
+               OUT_BATCH(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
+               OUT_BATCH(0x0);
+               OUT_BATCH_REGSEQ(R500_US_CODE_ADDR, 3);
+               OUT_BATCH(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
+               OUT_BATCH(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
+               OUT_BATCH(R500_US_CODE_OFFSET_ADDR(0));
+               END_BATCH();
+
+               r500fp.check = check_r500fp;
+               r500fp.cmd = _cmd;
+               r500fp.cmd[0] = cmdr500fp(r300->radeon.radeonScreen, 0, 1, 0, 0);
+               r500fp.cmd[1] = R500_INST_TYPE_OUT |
+                       R500_INST_TEX_SEM_WAIT |
+                       R500_INST_LAST |
+                       R500_INST_RGB_OMASK_R |
+                       R500_INST_RGB_OMASK_G |
+                       R500_INST_RGB_OMASK_B |
+                       R500_INST_ALPHA_OMASK |
+                       R500_INST_RGB_CLAMP |
+                       R500_INST_ALPHA_CLAMP;
+               r500fp.cmd[2] = R500_RGB_ADDR0(0) |
+                       R500_RGB_ADDR1(0) |
+                       R500_RGB_ADDR1_CONST |
+                       R500_RGB_ADDR2(0) |
+                       R500_RGB_ADDR2_CONST;
+               r500fp.cmd[3] = R500_ALPHA_ADDR0(0) |
+                       R500_ALPHA_ADDR1(0) |
+                       R500_ALPHA_ADDR1_CONST |
+                       R500_ALPHA_ADDR2(0) |
+                       R500_ALPHA_ADDR2_CONST;
+               r500fp.cmd[4] = R500_ALU_RGB_SEL_A_SRC0 |
+                       R500_ALU_RGB_R_SWIZ_A_R |
+                       R500_ALU_RGB_G_SWIZ_A_G |
+                       R500_ALU_RGB_B_SWIZ_A_B |
+                       R500_ALU_RGB_SEL_B_SRC0 |
+                       R500_ALU_RGB_R_SWIZ_B_R |
+                       R500_ALU_RGB_B_SWIZ_B_G |
+                       R500_ALU_RGB_G_SWIZ_B_B;
+               r500fp.cmd[5] = R500_ALPHA_OP_CMP |
+                       R500_ALPHA_SWIZ_A_A |
+                       R500_ALPHA_SWIZ_B_A;
+               r500fp.cmd[6] = R500_ALU_RGBA_OP_CMP |
+                       R500_ALU_RGBA_R_SWIZ_0 |
+                       R500_ALU_RGBA_G_SWIZ_0 |
+                       R500_ALU_RGBA_B_SWIZ_0 |
+                       R500_ALU_RGBA_A_SWIZ_0;
+               
+               r500fp.cmd[7] = 0;
+               emit_r500fp(ctx, &r500fp);
+       }
+
+       BEGIN_BATCH(2);
+       OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+       END_BATCH();
+
+       if (has_tcl) {
+               vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+                       (5 << R300_PVS_NUM_CNTLRS_SHIFT) |
+                       (12 << R300_VF_MAX_VTX_NUM_SHIFT));
+               if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+                       vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
+       } else {
+               vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+                       (5 << R300_PVS_NUM_CNTLRS_SHIFT) |
+                       (5 << R300_VF_MAX_VTX_NUM_SHIFT));
+       }
+
+       if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515)
+               vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
+       else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) ||
+                (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) ||
+                (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
+               vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
+       else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) ||
+                (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420))
+               vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
+       else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) ||
+                (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580))
+               vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
+       else
+               vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
+
+       R300_STATECHANGE(r300, vap_cntl);
+
+       BEGIN_BATCH(2);
+       OUT_BATCH_REGVAL(R300_VAP_CNTL, vap_cntl);
+       END_BATCH();
+
+       if (has_tcl) {
+        struct radeon_state_atom vpu;
+        uint32_t _cmd[10];
+               R300_STATECHANGE(r300, pvs);
+               R300_STATECHANGE(r300, vpi);
+
+               BEGIN_BATCH(4);
+               OUT_BATCH_REGSEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
+               OUT_BATCH((0 << R300_PVS_FIRST_INST_SHIFT) |
+                         (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+                         (1 << R300_PVS_LAST_INST_SHIFT));
+               OUT_BATCH((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
+                         (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
+               OUT_BATCH(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+               END_BATCH();
+
+               vpu.check = check_vpu;
+               vpu.cmd = _cmd;
+               vpu.cmd[0] = cmdvpu(r300->radeon.radeonScreen, 0, 2);
+
+               vpu.cmd[1] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE,
+                                         0, 0xf, PVS_DST_REG_OUT);
+               vpu.cmd[2] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y,
+                                      PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W,
+                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+               vpu.cmd[3] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+               vpu.cmd[4] = 0x0;
+
+               vpu.cmd[5] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf,
+                                         PVS_DST_REG_OUT);
+               vpu.cmd[6] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X,
+                                      PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z,
+                                      PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT,
+
+                                      VSF_FLAG_NONE);
+               vpu.cmd[7] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+               vpu.cmd[8] = 0x0;
+
+               r300->vap_flush_needed = GL_TRUE;
+               emit_vpu(ctx, &vpu);
+       }
+}
+
+static void r300KernelClear(GLcontext *ctx, GLuint flags)
+{              
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       __DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+       struct radeon_framebuffer *rfb = dPriv->driverPrivate;
+       struct radeon_renderbuffer *rrb;
+       struct radeon_renderbuffer *rrbd;
+       int bits = 0;
+
+       /* Make sure it fits there. */
+       rcommonEnsureCmdBufSpace(&r300->radeon, 421 * 3, __FUNCTION__);
+       if (flags || bits)
+               r300EmitClearState(ctx);
+       rrbd = radeon_get_renderbuffer(&rfb->base, BUFFER_DEPTH);
+       if (rrbd && (flags & BUFFER_BIT_DEPTH))
+               bits |= CLEARBUFFER_DEPTH;
+
+       if (rrbd && (flags & BUFFER_BIT_STENCIL))
+               bits |= CLEARBUFFER_STENCIL;
+
+       if (flags & BUFFER_BIT_COLOR0) {
+               rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_COLOR0);
+               r300ClearBuffer(r300, CLEARBUFFER_COLOR, rrb, NULL);
+               bits = 0;
+       }
+               
+       if (flags & BUFFER_BIT_FRONT_LEFT) {
+               rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_FRONT_LEFT);
+               r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
+               bits = 0;
+       }
+
+       if (flags & BUFFER_BIT_BACK_LEFT) {
+               rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_BACK_LEFT);
+               r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
+               bits = 0;
+       }
+
+       if (bits)
+               r300ClearBuffer(r300, bits, NULL, rrbd);
+
+       COMMIT_BATCH();
+}
+
+/**
+ * Buffer clear
+ */
+static void r300Clear(GLcontext * ctx, GLbitfield mask)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       __DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+       const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
+       GLbitfield swrast_mask = 0, tri_mask = 0;
+       int i;
+       struct gl_framebuffer *fb = ctx->DrawBuffer;
+
+       if (RADEON_DEBUG & DEBUG_IOCTL)
+               fprintf(stderr, "r300Clear\n");
+
+       if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+               LOCK_HARDWARE(&r300->radeon);
+               UNLOCK_HARDWARE(&r300->radeon);
+               if (dPriv->numClipRects == 0)
+                       return;
+       }
+
+       /* Flush swtcl vertices if necessary, because we will change hardware
+        * state during clear. See also the state-related comment in
+        * r300EmitClearState.
+        */
+       R300_NEWPRIM(r300);
+
+       if (colorMask == ~0)
+         tri_mask |= (mask & BUFFER_BITS_COLOR);
+
+
+       /* HW stencil */
+       if (mask & BUFFER_BIT_STENCIL) {
+               tri_mask |= BUFFER_BIT_STENCIL;
+       }
+
+       /* HW depth */
+       if (mask & BUFFER_BIT_DEPTH) {
+               tri_mask |= BUFFER_BIT_DEPTH;
+       }
+
+       /* If we're doing a tri pass for depth/stencil, include a likely color
+        * buffer with it.
+        */
+
+       for (i = 0; i < BUFFER_COUNT; i++) {
+         GLuint bufBit = 1 << i;
+         if ((tri_mask) & bufBit) {
+           if (!fb->Attachment[i].Renderbuffer->ClassID) {
+             tri_mask &= ~bufBit;
+             swrast_mask |= bufBit;
+           }
+         }
+       }
+
+       /* SW fallback clearing */
+       swrast_mask = mask & ~tri_mask;
+
+       if (tri_mask) {
+               if (r300->radeon.radeonScreen->kernel_mm)
+                       r300UserClear(ctx, tri_mask);
+               else
+                       r300KernelClear(ctx, tri_mask);
+       }
+       if (swrast_mask) {
+               if (RADEON_DEBUG & DEBUG_FALLBACKS)
+                       fprintf(stderr, "%s: swrast clear, mask: %x\n",
+                               __FUNCTION__, swrast_mask);
+               _swrast_Clear(ctx, swrast_mask);
+       }
+}
+
+
+void r300InitIoctlFuncs(struct dd_function_table *functions)
+{
+       functions->Clear = r300Clear;
+       functions->Finish = radeonFinish;
+       functions->Flush = radeonFlush;
+}
diff --git a/src/mesa/drivers/dri/r600/r600_ioctl.h b/src/mesa/drivers/dri/r600/r600_ioctl.h
new file mode 100644 (file)
index 0000000..63beab9
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R600_IOCTL_H__
+#define __R600_IOCTL_H__
+
+#include "r600_context.h"
+#include "radeon_drm.h"
+
+extern void r300InitIoctlFuncs(struct dd_function_table *functions);
+
+#endif                         /* __R600_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/r600/r600_reg.h b/src/mesa/drivers/dri/r600/r600_reg.h
new file mode 100644 (file)
index 0000000..ed552d0
--- /dev/null
@@ -0,0 +1,3265 @@
+/**************************************************************************
+
+Copyright (C) 2004-2005 Nicolai Haehnle et al.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/* *INDENT-OFF* */
+
+#ifndef _R300_REG_H
+#define _R300_REG_H
+
+#define R300_MC_INIT_MISC_LAT_TIMER    0x180
+#      define R300_MC_MISC__MC_CPR_INIT_LAT_SHIFT      0
+#      define R300_MC_MISC__MC_VF_INIT_LAT_SHIFT       4
+#      define R300_MC_MISC__MC_DISP0R_INIT_LAT_SHIFT   8
+#      define R300_MC_MISC__MC_DISP1R_INIT_LAT_SHIFT   12
+#      define R300_MC_MISC__MC_FIXED_INIT_LAT_SHIFT    16
+#      define R300_MC_MISC__MC_E2R_INIT_LAT_SHIFT      20
+#      define R300_MC_MISC__MC_SAME_PAGE_PRIO_SHIFT    24
+#      define R300_MC_MISC__MC_GLOBW_INIT_LAT_SHIFT    28
+
+
+#define R300_MC_INIT_GFX_LAT_TIMER     0x154
+#      define R300_MC_MISC__MC_G3D0R_INIT_LAT_SHIFT    0
+#      define R300_MC_MISC__MC_G3D1R_INIT_LAT_SHIFT    4
+#      define R300_MC_MISC__MC_G3D2R_INIT_LAT_SHIFT    8
+#      define R300_MC_MISC__MC_G3D3R_INIT_LAT_SHIFT    12
+#      define R300_MC_MISC__MC_TX0R_INIT_LAT_SHIFT     16
+#      define R300_MC_MISC__MC_TX1R_INIT_LAT_SHIFT     20
+#      define R300_MC_MISC__MC_GLOBR_INIT_LAT_SHIFT    24
+#      define R300_MC_MISC__MC_GLOBW_FULL_LAT_SHIFT    28
+
+/*
+ * This file contains registers and constants for the R300. They have been
+ * found mostly by examining command buffers captured using glxtest, as well
+ * as by extrapolating some known registers and constants from the R200.
+ * I am fairly certain that they are correct unless stated otherwise
+ * in comments.
+ */
+
+#define R300_SE_VPORT_XSCALE                0x1D98
+#define R300_SE_VPORT_XOFFSET               0x1D9C
+#define R300_SE_VPORT_YSCALE                0x1DA0
+#define R300_SE_VPORT_YOFFSET               0x1DA4
+#define R300_SE_VPORT_ZSCALE                0x1DA8
+#define R300_SE_VPORT_ZOFFSET               0x1DAC
+
+#define R300_VAP_PORT_IDX0                 0x2040
+/*
+ * Vertex Array Processing (VAP) Control
+ */
+#define R300_VAP_CNTL  0x2080
+#       define R300_PVS_NUM_SLOTS_SHIFT                 0
+#       define R300_PVS_NUM_CNTLRS_SHIFT                4
+#       define R300_PVS_NUM_FPUS_SHIFT                  8
+#       define R300_VF_MAX_VTX_NUM_SHIFT                18
+#       define R300_GL_CLIP_SPACE_DEF                   (0 << 22)
+#       define R300_DX_CLIP_SPACE_DEF                   (1 << 22)
+#       define R500_TCL_STATE_OPTIMIZATION              (1 << 23)
+
+/* This register is written directly and also starts data section
+ * in many 3d CP_PACKET3's
+ */
+#define R300_VAP_VF_CNTL       0x2084
+#      define  R300_VAP_VF_CNTL__PRIM_TYPE__SHIFT              0
+#      define  R300_VAP_VF_CNTL__PRIM_NONE                     (0<<0)
+#      define  R300_VAP_VF_CNTL__PRIM_POINTS                   (1<<0)
+#      define  R300_VAP_VF_CNTL__PRIM_LINES                    (2<<0)
+#      define  R300_VAP_VF_CNTL__PRIM_LINE_STRIP               (3<<0)
+#      define  R300_VAP_VF_CNTL__PRIM_TRIANGLES                (4<<0)
+#      define  R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN             (5<<0)
+#      define  R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP           (6<<0)
+#      define  R300_VAP_VF_CNTL__PRIM_LINE_LOOP                (12<<0)
+#      define  R300_VAP_VF_CNTL__PRIM_QUADS                    (13<<0)
+#      define  R300_VAP_VF_CNTL__PRIM_QUAD_STRIP               (14<<0)
+#      define  R300_VAP_VF_CNTL__PRIM_POLYGON                  (15<<0)
+
+#      define  R300_VAP_VF_CNTL__PRIM_WALK__SHIFT              4
+       /* State based - direct writes to registers trigger vertex
+           generation */
+#      define  R300_VAP_VF_CNTL__PRIM_WALK_STATE_BASED         (0<<4)
+#      define  R300_VAP_VF_CNTL__PRIM_WALK_INDICES             (1<<4)
+#      define  R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST         (2<<4)
+#      define  R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_EMBEDDED     (3<<4)
+
+       /* I don't think I saw these three used.. */
+#      define  R300_VAP_VF_CNTL__COLOR_ORDER__SHIFT            6
+#      define  R300_VAP_VF_CNTL__TCL_OUTPUT_CTL_ENA__SHIFT     9
+#      define  R300_VAP_VF_CNTL__PROG_STREAM_ENA__SHIFT        10
+
+       /* index size - when not set the indices are assumed to be 16 bit */
+#      define  R300_VAP_VF_CNTL__INDEX_SIZE_32bit              (1<<11)
+       /* number of vertices */
+#      define  R300_VAP_VF_CNTL__NUM_VERTICES__SHIFT           16
+
+#define R500_VAP_INDEX_OFFSET              0x208c
+
+#define R300_VAP_OUTPUT_VTX_FMT_0           0x2090
+#       define R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT     (1<<0)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT (1<<1)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT (1<<2)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT (1<<3)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT (1<<4)
+#       define R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT (1<<16)
+
+#define R300_VAP_OUTPUT_VTX_FMT_1           0x2094
+       /* each of the following is 3 bits wide, specifies number
+          of components */
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT 0
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT 3
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT 6
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT 9
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT 12
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT 15
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT 18
+#       define R300_VAP_OUTPUT_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT 21
+#      define R300_VAP_OUTPUT_VTX_FMT_1__NOT_PRESENT  0
+#      define R300_VAP_OUTPUT_VTX_FMT_1__1_COMPONENT  1
+#      define R300_VAP_OUTPUT_VTX_FMT_1__2_COMPONENTS 2
+#      define R300_VAP_OUTPUT_VTX_FMT_1__3_COMPONENTS 3
+#      define R300_VAP_OUTPUT_VTX_FMT_1__4_COMPONENTS 4
+
+#define R300_SE_VTE_CNTL                  0x20b0
+#      define     R300_VPORT_X_SCALE_ENA                (1 << 0)
+#      define     R300_VPORT_X_OFFSET_ENA               (1 << 1)
+#      define     R300_VPORT_Y_SCALE_ENA                (1 << 2)
+#      define     R300_VPORT_Y_OFFSET_ENA               (1 << 3)
+#      define     R300_VPORT_Z_SCALE_ENA                (1 << 4)
+#      define     R300_VPORT_Z_OFFSET_ENA               (1 << 5)
+#      define     R300_VTX_XY_FMT                       (1 << 8)
+#      define     R300_VTX_Z_FMT                        (1 << 9)
+#      define     R300_VTX_W0_FMT                       (1 << 10)
+#      define     R300_SERIAL_PROC_ENA                  (1 << 11)
+
+/* BEGIN: Vertex data assembly - lots of uncertainties */
+
+/* gap */
+
+/* Maximum Vertex Indx Clamp */
+#define R300_VAP_VF_MAX_VTX_INDX         0x2134
+/* Minimum Vertex Indx Clamp */
+#define R300_VAP_VF_MIN_VTX_INDX         0x2138
+
+/** Vertex assembler/processor control status */
+#define R300_VAP_CNTL_STATUS              0x2140
+/* No swap at all (default) */
+#      define R300_VC_NO_SWAP                  (0 << 0)
+/* 16-bit swap: 0xAABBCCDD becomes 0xBBAADDCC */
+#      define R300_VC_16BIT_SWAP               (1 << 0)
+/* 32-bit swap: 0xAABBCCDD becomes 0xDDCCBBAA */
+#      define R300_VC_32BIT_SWAP               (2 << 0)
+/* Half-dword swap: 0xAABBCCDD becomes 0xCCDDAABB */
+#      define R300_VC_HALF_DWORD_SWAP          (3 << 0)
+/* The TCL engine will not be used (as it is logically or even physically removed) */
+#      define R300_VAP_TCL_BYPASS              (1 << 8)
+/* Read only flag if TCL engine is busy. */
+#      define R300_VAP_PVS_BUSY                (1 << 11)
+/* TODO: gap for MAX_MPS */
+/* Read only flag if the vertex store is busy. */
+#      define R300_VAP_VS_BUSY                 (1 << 24)
+/* Read only flag if the reciprocal engine is busy. */
+#      define R300_VAP_RCP_BUSY                (1 << 25)
+/* Read only flag if the viewport transform engine is busy. */
+#      define R300_VAP_VTE_BUSY                (1 << 26)
+/* Read only flag if the memory interface unit is busy. */
+#      define R300_VAP_MUI_BUSY                (1 << 27)
+/* Read only flag if the vertex cache is busy. */
+#      define R300_VAP_VC_BUSY                 (1 << 28)
+/* Read only flag if the vertex fetcher is busy. */
+#      define R300_VAP_VF_BUSY                 (1 << 29)
+/* Read only flag if the register pipeline is busy. */
+#      define R300_VAP_REGPIPE_BUSY            (1 << 30)
+/* Read only flag if the VAP engine is busy. */
+#      define R300_VAP_VAP_BUSY                (1 << 31)
+
+/* gap */
+
+/* Where do we get our vertex data?
+ *
+ * Vertex data either comes either from immediate mode registers or from
+ * vertex arrays.
+ * There appears to be no mixed mode (though we can force the pitch of
+ * vertex arrays to 0, effectively reusing the same element over and over
+ * again).
+ *
+ * Immediate mode is controlled by the INPUT_CNTL registers. I am not sure
+ * if these registers influence vertex array processing.
+ *
+ * Vertex arrays are controlled via the 3D_LOAD_VBPNTR packet3.
+ *
+ * In both cases, vertex attributes are then passed through INPUT_ROUTE.
+ *
+ * Beginning with INPUT_ROUTE_0_0 is a list of WORDs that route vertex data
+ * into the vertex processor's input registers.
+ * The first word routes the first input, the second word the second, etc.
+ * The corresponding input is routed into the register with the given index.
+ * The list is ended by a word with INPUT_ROUTE_END set.
+ *
+ * Always set COMPONENTS_4 in immediate mode.
+ */
+
+#define R300_VAP_PROG_STREAM_CNTL_0                     0x2150
+#       define R300_DATA_TYPE_0_SHIFT                   0
+#       define R300_DATA_TYPE_FLOAT_1                   0
+#       define R300_DATA_TYPE_FLOAT_2                   1
+#       define R300_DATA_TYPE_FLOAT_3                   2
+#       define R300_DATA_TYPE_FLOAT_4                   3
+#       define R300_DATA_TYPE_BYTE                      4
+#       define R300_DATA_TYPE_D3DCOLOR                  5
+#       define R300_DATA_TYPE_SHORT_2                   6
+#       define R300_DATA_TYPE_SHORT_4                   7
+#       define R300_DATA_TYPE_VECTOR_3_TTT              8
+#       define R300_DATA_TYPE_VECTOR_3_EET              9
+#       define R300_SKIP_DWORDS_SHIFT                   4
+#       define R300_DST_VEC_LOC_SHIFT                   8
+#       define R300_LAST_VEC                            (1 << 13)
+#       define R300_SIGNED                              (1 << 14)
+#       define R300_NORMALIZE                           (1 << 15)
+#       define R300_DATA_TYPE_1_SHIFT                   16
+#define R300_VAP_PROG_STREAM_CNTL_1                     0x2154
+#define R300_VAP_PROG_STREAM_CNTL_2                     0x2158
+#define R300_VAP_PROG_STREAM_CNTL_3                     0x215C
+#define R300_VAP_PROG_STREAM_CNTL_4                     0x2160
+#define R300_VAP_PROG_STREAM_CNTL_5                     0x2164
+#define R300_VAP_PROG_STREAM_CNTL_6                     0x2168
+#define R300_VAP_PROG_STREAM_CNTL_7                     0x216C
+/* gap */
+
+/* Notes:
+ *  - always set up to produce at least two attributes:
+ *    if vertex program uses only position, fglrx will set normal, too
+ *  - INPUT_CNTL_0_COLOR and INPUT_CNTL_COLOR bits are always equal.
+ */
+#define R300_VAP_VTX_STATE_CNTL               0x2180
+#       define R300_COLOR_0_ASSEMBLY_SHIFT    0
+#       define R300_SEL_COLOR                 0
+#       define R300_SEL_USER_COLOR_0          1
+#       define R300_SEL_USER_COLOR_1          2
+#       define R300_COLOR_1_ASSEMBLY_SHIFT    2
+#       define R300_COLOR_2_ASSEMBLY_SHIFT    4
+#       define R300_COLOR_3_ASSEMBLY_SHIFT    6
+#       define R300_COLOR_4_ASSEMBLY_SHIFT    8
+#       define R300_COLOR_5_ASSEMBLY_SHIFT    10
+#       define R300_COLOR_6_ASSEMBLY_SHIFT    12
+#       define R300_COLOR_7_ASSEMBLY_SHIFT    14
+#       define R300_UPDATE_USER_COLOR_0_ENA   (1 << 16)
+
+/*
+ * Each bit in this field applies to the corresponding vector in the VSM
+ * memory (i.e. Bit 0 applies to VECTOR_0 (POSITION), etc.). If the bit
+ * is set, then the corresponding 4-Dword Vector is output into the Vertex Stream.
+ */
+#define R300_VAP_VSM_VTX_ASSM               0x2184
+#       define R300_INPUT_CNTL_POS               0x00000001
+#       define R300_INPUT_CNTL_NORMAL            0x00000002
+#       define R300_INPUT_CNTL_COLOR             0x00000004
+#       define R300_INPUT_CNTL_TC0               0x00000400
+#       define R300_INPUT_CNTL_TC1               0x00000800
+#       define R300_INPUT_CNTL_TC2               0x00001000 /* GUESS */
+#       define R300_INPUT_CNTL_TC3               0x00002000 /* GUESS */
+#       define R300_INPUT_CNTL_TC4               0x00004000 /* GUESS */
+#       define R300_INPUT_CNTL_TC5               0x00008000 /* GUESS */
+#       define R300_INPUT_CNTL_TC6               0x00010000 /* GUESS */
+#       define R300_INPUT_CNTL_TC7               0x00020000 /* GUESS */
+
+/* Programmable Stream Control Signed Normalize Control */
+#define R300_VAP_PSC_SGN_NORM_CNTL         0x21dc
+#      define SGN_NORM_ZERO                 0
+#      define SGN_NORM_ZERO_CLAMP_MINUS_ONE 1
+#      define SGN_NORM_NO_ZERO              2
+
+/* gap */
+
+/* Words parallel to INPUT_ROUTE_0; All words that are active in INPUT_ROUTE_0
+ * are set to a swizzling bit pattern, other words are 0.
+ *
+ * In immediate mode, the pattern is always set to xyzw. In vertex array
+ * mode, the swizzling pattern is e.g. used to set zw components in texture
+ * coordinates with only tweo components.
+ */
+#define R300_VAP_PROG_STREAM_CNTL_EXT_0                 0x21e0
+#       define R300_SWIZZLE0_SHIFT                      0
+#       define R300_SWIZZLE_SELECT_X_SHIFT              0
+#       define R300_SWIZZLE_SELECT_Y_SHIFT              3
+#       define R300_SWIZZLE_SELECT_Z_SHIFT              6
+#       define R300_SWIZZLE_SELECT_W_SHIFT              9
+
+#       define R300_SWIZZLE_SELECT_X                    0
+#       define R300_SWIZZLE_SELECT_Y                    1
+#       define R300_SWIZZLE_SELECT_Z                    2
+#       define R300_SWIZZLE_SELECT_W                    3
+#       define R300_SWIZZLE_SELECT_FP_ZERO              4
+#       define R300_SWIZZLE_SELECT_FP_ONE               5
+/* alternate forms for r300_emit.c */
+#       define R300_INPUT_ROUTE_SELECT_X    0
+#       define R300_INPUT_ROUTE_SELECT_Y    1
+#       define R300_INPUT_ROUTE_SELECT_Z    2
+#       define R300_INPUT_ROUTE_SELECT_W    3
+#       define R300_INPUT_ROUTE_SELECT_ZERO 4
+#       define R300_INPUT_ROUTE_SELECT_ONE  5
+
+#       define R300_WRITE_ENA_SHIFT                     12
+#       define R300_WRITE_ENA_X                         1
+#       define R300_WRITE_ENA_Y                         2
+#       define R300_WRITE_ENA_Z                         4
+#       define R300_WRITE_ENA_W                         8
+#       define R300_SWIZZLE1_SHIFT                      16
+#define R300_VAP_PROG_STREAM_CNTL_EXT_1                 0x21e4
+#define R300_VAP_PROG_STREAM_CNTL_EXT_2                 0x21e8
+#define R300_VAP_PROG_STREAM_CNTL_EXT_3                 0x21ec
+#define R300_VAP_PROG_STREAM_CNTL_EXT_4                 0x21f0
+#define R300_VAP_PROG_STREAM_CNTL_EXT_5                 0x21f4
+#define R300_VAP_PROG_STREAM_CNTL_EXT_6                 0x21f8
+#define R300_VAP_PROG_STREAM_CNTL_EXT_7                 0x21fc
+
+/* END: Vertex data assembly */
+
+/* gap */
+
+/* BEGIN: Upload vertex program and data */
+
+/*
+ * The programmable vertex shader unit has a memory bank of unknown size
+ * that can be written to in 16 byte units by writing the address into
+ * UPLOAD_ADDRESS, followed by data in UPLOAD_DATA (multiples of 4 DWORDs).
+ *
+ * Pointers into the memory bank are always in multiples of 16 bytes.
+ *
+ * The memory bank is divided into areas with fixed meaning.
+ *
+ * Starting at address UPLOAD_PROGRAM: Vertex program instructions.
+ * Native limits reported by drivers from ATI suggest size 256 (i.e. 4KB),
+ * whereas the difference between known addresses suggests size 512.
+ *
+ * Starting at address UPLOAD_PARAMETERS: Vertex program parameters.
+ * Native reported limits and the VPI layout suggest size 256, whereas
+ * difference between known addresses suggests size 512.
+ *
+ * At address UPLOAD_POINTSIZE is a vector (0, 0, ps, 0), where ps is the
+ * floating point pointsize. The exact purpose of this state is uncertain,
+ * as there is also the R300_RE_POINTSIZE register.
+ *
+ * Multiple vertex programs and parameter sets can be loaded at once,
+ * which could explain the size discrepancy.
+ */
+#define R300_VAP_PVS_VECTOR_INDX_REG         0x2200
+#       define R300_PVS_CODE_START           0
+#       define R300_MAX_PVS_CODE_LINES       256
+#       define R500_MAX_PVS_CODE_LINES       1024
+#       define R300_PVS_CONST_START          512
+#       define R500_PVS_CONST_START          1024
+#       define R300_MAX_PVS_CONST_VECS       256
+#       define R500_MAX_PVS_CONST_VECS       1024
+#       define R300_PVS_UCP_START            1024
+#       define R500_PVS_UCP_START            1536
+#       define R300_POINT_VPORT_SCALE_OFFSET 1030
+#       define R500_POINT_VPORT_SCALE_OFFSET 1542
+#       define R300_POINT_GEN_TEX_OFFSET     1031
+#       define R500_POINT_GEN_TEX_OFFSET     1543
+
+/*
+ * These are obsolete defines form r300_context.h, but they might give some
+ * clues when investigating the addresses further...
+ */
+#if 0
+#define VSF_DEST_PROGRAM        0x0
+#define VSF_DEST_MATRIX0        0x200
+#define VSF_DEST_MATRIX1        0x204
+#define VSF_DEST_MATRIX2        0x208
+#define VSF_DEST_VECTOR0        0x20c
+#define VSF_DEST_VECTOR1        0x20d
+#define VSF_DEST_UNKNOWN1       0x400
+#define VSF_DEST_UNKNOWN2       0x406
+#endif
+
+/* gap */
+
+#define R300_VAP_PVS_UPLOAD_DATA            0x2208
+
+/* END: Upload vertex program and data */
+
+/* gap */
+
+/* I do not know the purpose of this register. However, I do know that
+ * it is set to 221C_CLEAR for clear operations and to 221C_NORMAL
+ * for normal rendering.
+ *
+ * 2007-11-05: This register is the user clip plane control register, but there
+ * also seems to be a rendering mode control; the NORMAL/CLEAR defines.
+ *
+ * See bug #9871. http://bugs.freedesktop.org/attachment.cgi?id=10672&action=view
+ */
+#define R300_VAP_CLIP_CNTL                       0x221C
+#       define R300_VAP_UCP_ENABLE_0             (1 << 0)
+#       define R300_VAP_UCP_ENABLE_1             (1 << 1)
+#       define R300_VAP_UCP_ENABLE_2             (1 << 2)
+#       define R300_VAP_UCP_ENABLE_3             (1 << 3)
+#       define R300_VAP_UCP_ENABLE_4             (1 << 4)
+#       define R300_VAP_UCP_ENABLE_5             (1 << 5)
+#       define R300_PS_UCP_MODE_DIST_COP         (0 << 14)
+#       define R300_PS_UCP_MODE_RADIUS_COP       (1 << 14)
+#       define R300_PS_UCP_MODE_RADIUS_COP_CLIP  (2 << 14)
+#       define R300_PS_UCP_MODE_CLIP_AS_TRIFAN   (3 << 14)
+#       define R300_CLIP_DISABLE                 (1 << 16)
+#       define R300_UCP_CULL_ONLY_ENABLE         (1 << 17)
+#       define R300_BOUNDARY_EDGE_FLAG_ENABLE    (1 << 18)
+#       define R500_COLOR2_IS_TEXTURE            (1 << 20)
+#       define R500_COLOR3_IS_TEXTURE            (1 << 21)
+
+/* These seem to be per-pixel and per-vertex X and Y clipping planes. The first
+ * plane is per-pixel and the second plane is per-vertex.
+ *
+ * This was determined by experimentation alone but I believe it is correct.
+ *
+ * These registers are called X_QUAD0_1_FL to X_QUAD0_4_FL by glxtest.
+ */
+#define R300_VAP_GB_VERT_CLIP_ADJ                   0x2220
+#define R300_VAP_GB_VERT_DISC_ADJ                   0x2224
+#define R300_VAP_GB_HORZ_CLIP_ADJ                   0x2228
+#define R300_VAP_GB_HORZ_DISC_ADJ                   0x222c
+
+/* gap */
+
+/* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
+ * rendering commands and overwriting vertex program parameters.
+ * Therefore, I suspect writing zero to 0x2284 synchronizes the engine and
+ * avoids bugs caused by still running shaders reading bad data from memory.
+ */
+#define R300_VAP_PVS_STATE_FLUSH_REG        0x2284
+
+/* This register is used to define the number of core clocks to wait for a
+ * vertex to be received by the VAP input controller (while the primitive
+ * path is backed up) before forcing any accumulated vertices to be submitted
+ * to the vertex processing path.
+ */
+#define VAP_PVS_VTX_TIMEOUT_REG             0x2288
+#       define R300_2288_R300                    0x00750000 /* -- nh */
+#       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */
+
+/* gap */
+
+/* Addresses are relative to the vertex program instruction area of the
+ * memory bank. PROGRAM_END points to the last instruction of the active
+ * program
+ *
+ * The meaning of the two UNKNOWN fields is obviously not known. However,
+ * experiments so far have shown that both *must* point to an instruction
+ * inside the vertex program, otherwise the GPU locks up.
+ *
+ * fglrx usually sets CNTL_3_UNKNOWN to the end of the program and
+ * R300_PVS_CNTL_1_POS_END_SHIFT points to instruction where last write to
+ * position takes place.
+ *
+ * Most likely this is used to ignore rest of the program in cases
+ * where group of verts arent visible. For some reason this "section"
+ * is sometimes accepted other instruction that have no relationship with
+ * position calculations.
+ */
+#define R300_VAP_PVS_CODE_CNTL_0            0x22D0
+#       define R300_PVS_FIRST_INST_SHIFT         0
+#       define R300_PVS_XYZW_VALID_INST_SHIFT    10
+#       define R300_PVS_LAST_INST_SHIFT          20
+/* Addresses are relative the the vertex program parameters area. */
+#define R300_VAP_PVS_CONST_CNTL             0x22D4
+#       define R300_PVS_CONST_BASE_OFFSET_SHIFT  0
+#       define R300_PVS_MAX_CONST_ADDR_SHIFT     16
+#define R300_VAP_PVS_CODE_CNTL_1           0x22D8
+#       define R300_PVS_LAST_VTX_SRC_INST_SHIFT  0
+#define R300_VAP_PVS_FLOW_CNTL_OPC          0x22DC
+
+/* The entire range from 0x2300 to 0x2AC inclusive seems to be used for
+ * immediate vertices
+ */
+#define R300_VAP_VTX_COLOR_R                0x2464
+#define R300_VAP_VTX_COLOR_G                0x2468
+#define R300_VAP_VTX_COLOR_B                0x246C
+#define R300_VAP_VTX_POS_0_X_1              0x2490 /* used for glVertex2*() */
+#define R300_VAP_VTX_POS_0_Y_1              0x2494
+#define R300_VAP_VTX_COLOR_PKD              0x249C /* RGBA */
+#define R300_VAP_VTX_POS_0_X_2              0x24A0 /* used for glVertex3*() */
+#define R300_VAP_VTX_POS_0_Y_2              0x24A4
+#define R300_VAP_VTX_POS_0_Z_2              0x24A8
+/* write 0 to indicate end of packet? */
+#define R300_VAP_VTX_END_OF_PKT             0x24AC
+
+/* gap */
+
+/* These are values from r300_reg/r300_reg.h - they are known to be correct
+ * and are here so we can use one register file instead of several
+ * - Vladimir
+ */
+#define R300_GB_VAP_RASTER_VTX_FMT_0   0x4000
+#      define R300_GB_VAP_RASTER_VTX_FMT_0__POS_PRESENT        (1<<0)
+#      define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_0_PRESENT    (1<<1)
+#      define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_1_PRESENT    (1<<2)
+#      define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_2_PRESENT    (1<<3)
+#      define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_3_PRESENT    (1<<4)
+#      define R300_GB_VAP_RASTER_VTX_FMT_0__COLOR_SPACE        (0xf<<5)
+#      define R300_GB_VAP_RASTER_VTX_FMT_0__PT_SIZE_PRESENT    (0x1<<16)
+
+#define R300_GB_VAP_RASTER_VTX_FMT_1   0x4004
+       /* each of the following is 3 bits wide, specifies number
+          of components */
+#      define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_0_COMP_CNT_SHIFT       0
+#      define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_1_COMP_CNT_SHIFT       3
+#      define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_2_COMP_CNT_SHIFT       6
+#      define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_3_COMP_CNT_SHIFT       9
+#      define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_4_COMP_CNT_SHIFT       12
+#      define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_5_COMP_CNT_SHIFT       15
+#      define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_6_COMP_CNT_SHIFT       18
+#      define R300_GB_VAP_RASTER_VTX_FMT_1__TEX_7_COMP_CNT_SHIFT       21
+
+/* UNK30 seems to enables point to quad transformation on textures
+ * (or something closely related to that).
+ * This bit is rather fatal at the time being due to lackings at pixel
+ * shader side
+ * Specifies top of Raster pipe specific enable controls.
+ */
+#define R300_GB_ENABLE 0x4008
+#      define R300_GB_POINT_STUFF_DISABLE     (0 << 0)
+#      define R300_GB_POINT_STUFF_ENABLE      (1 << 0) /* Specifies if points will have stuffed texture coordinates. */
+#      define R300_GB_LINE_STUFF_DISABLE      (0 << 1)
+#      define R300_GB_LINE_STUFF_ENABLE       (1 << 1) /* Specifies if lines will have stuffed texture coordinates. */
+#      define R300_GB_TRIANGLE_STUFF_DISABLE  (0 << 2)
+#      define R300_GB_TRIANGLE_STUFF_ENABLE   (1 << 2) /* Specifies if triangles will have stuffed texture coordinates. */
+#      define R300_GB_STENCIL_AUTO_DISABLE    (0 << 4)
+#      define R300_GB_STENCIL_AUTO_ENABLE     (1 << 4) /* Enable stencil auto inc/dec based on triangle cw/ccw, force into dzy low bit. */
+#      define R300_GB_STENCIL_AUTO_FORCE      (2 << 4) /* Force 0 into dzy low bit. */
+
+       /* each of the following is 2 bits wide */
+#define R300_GB_TEX_REPLICATE  0 /* Replicate VAP source texture coordinates (S,T,[R,Q]). */
+#define R300_GB_TEX_ST         1 /* Stuff with source texture coordinates (S,T). */
+#define R300_GB_TEX_STR                2 /* Stuff with source texture coordinates (S,T,R). */
+#      define R300_GB_TEX0_SOURCE_SHIFT        16
+#      define R300_GB_TEX1_SOURCE_SHIFT        18
+#      define R300_GB_TEX2_SOURCE_SHIFT        20
+#      define R300_GB_TEX3_SOURCE_SHIFT        22
+#      define R300_GB_TEX4_SOURCE_SHIFT        24
+#      define R300_GB_TEX5_SOURCE_SHIFT        26
+#      define R300_GB_TEX6_SOURCE_SHIFT        28
+#      define R300_GB_TEX7_SOURCE_SHIFT        30
+
+/* MSPOS - positions for multisample antialiasing (?) */
+#define R300_GB_MSPOS0                           0x4010
+       /* shifts - each of the fields is 4 bits */
+#      define R300_GB_MSPOS0__MS_X0_SHIFT      0
+#      define R300_GB_MSPOS0__MS_Y0_SHIFT      4
+#      define R300_GB_MSPOS0__MS_X1_SHIFT      8
+#      define R300_GB_MSPOS0__MS_Y1_SHIFT      12
+#      define R300_GB_MSPOS0__MS_X2_SHIFT      16
+#      define R300_GB_MSPOS0__MS_Y2_SHIFT      20
+#      define R300_GB_MSPOS0__MSBD0_Y          24
+#      define R300_GB_MSPOS0__MSBD0_X          28
+
+#define R300_GB_MSPOS1                           0x4014
+#      define R300_GB_MSPOS1__MS_X3_SHIFT      0
+#      define R300_GB_MSPOS1__MS_Y3_SHIFT      4
+#      define R300_GB_MSPOS1__MS_X4_SHIFT      8
+#      define R300_GB_MSPOS1__MS_Y4_SHIFT      12
+#      define R300_GB_MSPOS1__MS_X5_SHIFT      16
+#      define R300_GB_MSPOS1__MS_Y5_SHIFT      20
+#      define R300_GB_MSPOS1__MSBD1            24
+
+/* Specifies the graphics pipeline configuration for rasterization. */
+#define R300_GB_TILE_CONFIG                      0x4018
+#      define R300_GB_TILE_DISABLE             (0 << 0)
+#      define R300_GB_TILE_ENABLE              (1 << 0)
+#      define R300_GB_TILE_PIPE_COUNT_RV300    (0 << 1) /* RV350 (1 pipe, 1 ctx) */
+#      define R300_GB_TILE_PIPE_COUNT_R300     (3 << 1) /* R300 (2 pipes, 1 ctx) */
+#      define R300_GB_TILE_PIPE_COUNT_R420_3P  (6 << 1) /* R420-3P (3 pipes, 1 ctx) */
+#      define R300_GB_TILE_PIPE_COUNT_R420     (7 << 1) /* R420 (4 pipes, 1 ctx) */
+#      define R300_GB_TILE_SIZE_8              (0 << 4)
+#      define R300_GB_TILE_SIZE_16             (1 << 4)
+#      define R300_GB_TILE_SIZE_32             (2 << 4)
+#      define R300_GB_SUPER_SIZE_1             (0 << 6)
+#      define R300_GB_SUPER_SIZE_2             (1 << 6)
+#      define R300_GB_SUPER_SIZE_4             (2 << 6)
+#      define R300_GB_SUPER_SIZE_8             (3 << 6)
+#      define R300_GB_SUPER_SIZE_16            (4 << 6)
+#      define R300_GB_SUPER_SIZE_32            (5 << 6)
+#      define R300_GB_SUPER_SIZE_64            (6 << 6)
+#      define R300_GB_SUPER_SIZE_128           (7 << 6)
+#      define R300_GB_SUPER_X_SHIFT            9       /* 3 bits wide */
+#      define R300_GB_SUPER_Y_SHIFT            12      /* 3 bits wide */
+#      define R300_GB_SUPER_TILE_A             (0 << 15)
+#      define R300_GB_SUPER_TILE_B             (1 << 15)
+#      define R300_GB_SUBPIXEL_1_12            (0 << 16)
+#      define R300_GB_SUBPIXEL_1_16            (1 << 16)
+#      define GB_TILE_CONFIG_QUADS_PER_RAS_4   (0 << 17)
+#      define GB_TILE_CONFIG_QUADS_PER_RAS_8   (1 << 17)
+#      define GB_TILE_CONFIG_QUADS_PER_RAS_16  (2 << 17)
+#      define GB_TILE_CONFIG_QUADS_PER_RAS_32  (3 << 17)
+#      define GB_TILE_CONFIG_BB_SCAN_INTERCEPT (0 << 19)
+#      define GB_TILE_CONFIG_BB_SCAN_BOUND_BOX (1 << 19)
+#      define GB_TILE_CONFIG_ALT_SCAN_EN_LR    (0 << 20)
+#      define GB_TILE_CONFIG_ALT_SCAN_EN_LRL   (1 << 20)
+#      define GB_TILE_CONFIG_ALT_OFFSET        (0 << 21)
+#      define GB_TILE_CONFIG_SUBPRECISION      (0 << 22)
+#      define GB_TILE_CONFIG_ALT_TILING_DEF    (0 << 23)
+#      define GB_TILE_CONFIG_ALT_TILING_3_2    (1 << 23)
+#      define GB_TILE_CONFIG_Z_EXTENDED_24_1   (0 << 24)
+#      define GB_TILE_CONFIG_Z_EXTENDED_S25_1  (1 << 24)
+
+/* Specifies the sizes of the various FIFO`s in the sc/rs/us. This register must be the first one written */
+#define R300_GB_FIFO_SIZE      0x4024
+       /* each of the following is 2 bits wide */
+#define R300_GB_FIFO_SIZE_32   0
+#define R300_GB_FIFO_SIZE_64   1
+#define R300_GB_FIFO_SIZE_128  2
+#define R300_GB_FIFO_SIZE_256  3
+#      define R300_SC_IFIFO_SIZE_SHIFT 0
+#      define R300_SC_TZFIFO_SIZE_SHIFT        2
+#      define R300_SC_BFIFO_SIZE_SHIFT 4
+
+#      define R300_US_OFIFO_SIZE_SHIFT 12
+#      define R300_US_WFIFO_SIZE_SHIFT 14
+       /* the following use the same constants as above, but meaning is
+          is times 2 (i.e. instead of 32 words it means 64 */
+#      define R300_RS_TFIFO_SIZE_SHIFT 6
+#      define R300_RS_CFIFO_SIZE_SHIFT 8
+#      define R300_US_RAM_SIZE_SHIFT           10
+       /* watermarks, 3 bits wide */
+#      define R300_RS_HIGHWATER_COL_SHIFT      16
+#      define R300_RS_HIGHWATER_TEX_SHIFT      19
+#      define R300_OFIFO_HIGHWATER_SHIFT       22      /* two bits only */
+#      define R300_CUBE_FIFO_HIGHWATER_COL_SHIFT       24
+
+#define GB_Z_PEQ_CONFIG                          0x4028
+#      define GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_4_4    (0 << 0)
+#      define GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_8_8    (1 << 0)
+
+/* Specifies various polygon specific selects (fog, depth, perspective). */
+#define R300_GB_SELECT                           0x401c
+#      define R300_GB_FOG_SELECT_C0A           (0 << 0)
+#      define R300_GB_FOG_SELECT_C1A           (1 << 0)
+#      define R300_GB_FOG_SELECT_C2A           (2 << 0)
+#      define R300_GB_FOG_SELECT_C3A           (3 << 0)
+#      define R300_GB_FOG_SELECT_1_1_W         (4 << 0)
+#      define R300_GB_FOG_SELECT_Z             (5 << 0)
+#      define R300_GB_DEPTH_SELECT_Z           (0 << 3)
+#      define R300_GB_DEPTH_SELECT_1_1_W       (1 << 3)
+#      define R300_GB_W_SELECT_1_W             (0 << 4)
+#      define R300_GB_W_SELECT_1               (1 << 4)
+#      define R300_GB_FOG_STUFF_DISABLE        (0 << 5)
+#      define R300_GB_FOG_STUFF_ENABLE         (1 << 5)
+#      define R300_GB_FOG_STUFF_TEX_SHIFT      6
+#      define R300_GB_FOG_STUFF_TEX_MASK       0x000003c0
+#      define R300_GB_FOG_STUFF_COMP_SHIFT     10
+#      define R300_GB_FOG_STUFF_COMP_MASK      0x00000c00
+
+/* Specifies the graphics pipeline configuration for antialiasing. */
+#define GB_AA_CONFIG                                    0x4020
+#      define GB_AA_CONFIG_AA_DISABLE           (0 << 0)
+#      define GB_AA_CONFIG_AA_ENABLE            (1 << 0)
+#      define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_2  (0 << 1)
+#      define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_3  (1 << 1)
+#      define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_4  (2 << 1)
+#      define GB_AA_CONFIG_NUM_AA_SUBSAMPLES_6  (3 << 1)
+
+/* Selects which of 4 pipes are active. */
+#define GB_PIPE_SELECT                           0x402c
+#      define GB_PIPE_SELECT_PIPE0_ID_SHIFT  0
+#      define GB_PIPE_SELECT_PIPE1_ID_SHIFT  2
+#      define GB_PIPE_SELECT_PIPE2_ID_SHIFT  4
+#      define GB_PIPE_SELECT_PIPE3_ID_SHIFT  6
+#      define GB_PIPE_SELECT_PIPE_MASK_SHIFT 8
+#      define GB_PIPE_SELECT_MAX_PIPE        12
+#      define GB_PIPE_SELECT_BAD_PIPES       14
+#      define GB_PIPE_SELECT_CONFIG_PIPES    18
+
+
+/* Specifies the sizes of the various FIFO`s in the sc/rs. */
+#define GB_FIFO_SIZE1                            0x4070
+/* High water mark for SC input fifo */
+#      define GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_SHIFT 0
+#      define GB_FIFO_SIZE1_SC_HIGHWATER_IFIFO_MASK  0x0000003f
+/* High water mark for SC input fifo (B) */
+#      define GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_SHIFT 6
+#      define GB_FIFO_SIZE1_SC_HIGHWATER_BFIFO_MASK  0x00000fc0
+/* High water mark for RS colors' fifo */
+#      define GB_FIFO_SIZE1_SC_HIGHWATER_COL_SHIFT   12
+#      define GB_FIFO_SIZE1_SC_HIGHWATER_COL_MASK    0x0003f000
+/* High water mark for RS textures' fifo */
+#      define GB_FIFO_SIZE1_SC_HIGHWATER_TEX_SHIFT   18
+#      define GB_FIFO_SIZE1_SC_HIGHWATER_TEX_MASK    0x00fc0000
+
+/* This table specifies the source location and format for up to 16 texture
+ * addresses (i[0]:i[15]) and four colors (c[0]:c[3])
+ */
+#define R500_RS_IP_0                                   0x4074
+#define R500_RS_IP_1                                   0x4078
+#define R500_RS_IP_2                                   0x407C
+#define R500_RS_IP_3                                   0x4080
+#define R500_RS_IP_4                                   0x4084
+#define R500_RS_IP_5                                   0x4088
+#define R500_RS_IP_6                                   0x408C
+#define R500_RS_IP_7                                   0x4090
+#define R500_RS_IP_8                                   0x4094
+#define R500_RS_IP_9                                   0x4098
+#define R500_RS_IP_10                                  0x409C
+#define R500_RS_IP_11                                  0x40A0
+#define R500_RS_IP_12                                  0x40A4
+#define R500_RS_IP_13                                  0x40A8
+#define R500_RS_IP_14                                  0x40AC
+#define R500_RS_IP_15                                  0x40B0
+#define R500_RS_IP_PTR_K0                               62
+#define R500_RS_IP_PTR_K1                               63
+#define R500_RS_IP_TEX_PTR_S_SHIFT                     0
+#define R500_RS_IP_TEX_PTR_T_SHIFT                     6
+#define R500_RS_IP_TEX_PTR_R_SHIFT                     12
+#define R500_RS_IP_TEX_PTR_Q_SHIFT                     18
+#define R500_RS_IP_COL_PTR_SHIFT                       24
+#define R500_RS_IP_COL_FMT_SHIFT                       27
+#      define R500_RS_COL_PTR(x)                       ((x) << 24)
+#       define R500_RS_COL_FMT(x)                       ((x) << 27)
+/* gap */
+#define R500_RS_IP_OFFSET_DIS                          (0 << 31)
+#define R500_RS_IP_OFFSET_EN                           (1 << 31)
+
+/* gap */
+
+/* Zero to flush caches. */
+#define R300_TX_INVALTAGS                   0x4100
+#define R300_TX_FLUSH                       0x0
+
+/* The upper enable bits are guessed, based on fglrx reported limits. */
+#define R300_TX_ENABLE                      0x4104
+#       define R300_TX_ENABLE_0                  (1 << 0)
+#       define R300_TX_ENABLE_1                  (1 << 1)
+#       define R300_TX_ENABLE_2                  (1 << 2)
+#       define R300_TX_ENABLE_3                  (1 << 3)
+#       define R300_TX_ENABLE_4                  (1 << 4)
+#       define R300_TX_ENABLE_5                  (1 << 5)
+#       define R300_TX_ENABLE_6                  (1 << 6)
+#       define R300_TX_ENABLE_7                  (1 << 7)
+#       define R300_TX_ENABLE_8                  (1 << 8)
+#       define R300_TX_ENABLE_9                  (1 << 9)
+#       define R300_TX_ENABLE_10                 (1 << 10)
+#       define R300_TX_ENABLE_11                 (1 << 11)
+#       define R300_TX_ENABLE_12                 (1 << 12)
+#       define R300_TX_ENABLE_13                 (1 << 13)
+#       define R300_TX_ENABLE_14                 (1 << 14)
+#       define R300_TX_ENABLE_15                 (1 << 15)
+
+#define R500_TX_FILTER_4                   0x4110
+#      define R500_TX_WEIGHT_1_SHIFT            (0)
+#      define R500_TX_WEIGHT_0_SHIFT            (11)
+#      define R500_TX_WEIGHT_PAIR               (1<<22)
+#      define R500_TX_PHASE_SHIFT               (23)
+#      define R500_TX_DIRECTION_HORIZONTAL      (0<<27)
+#      define R500_TX_DIRECTION_VERITCAL        (1<<27)
+
+/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) */
+#define R300_GA_POINT_S0                              0x4200
+
+/* T Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) */
+#define R300_GA_POINT_T0                              0x4204
+
+/* S Texture Coordinate of Vertex 2 for Point texture stuffing (URC) */
+#define R300_GA_POINT_S1                              0x4208
+
+/* T Texture Coordinate of Vertex 2 for Point texture stuffing (URC) */
+#define R300_GA_POINT_T1                              0x420c
+
+/* Specifies amount to shift integer position of vertex (screen space) before
+ * converting to float for triangle stipple.
+ */
+#define R300_GA_TRIANGLE_STIPPLE            0x4214
+#      define R300_GA_TRIANGLE_STIPPLE_X_SHIFT_SHIFT 0
+#      define R300_GA_TRIANGLE_STIPPLE_X_SHIFT_MASK  0x0000000f
+#      define R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_SHIFT 16
+#      define R300_GA_TRIANGLE_STIPPLE_Y_SHIFT_MASK  0x000f0000
+
+/* The pointsize is given in multiples of 6. The pointsize can be enormous:
+ * Clear() renders a single point that fills the entire framebuffer.
+ * 1/2 Height of point; fixed (16.0), subpixel format (1/12 or 1/16, even if in
+ * 8b precision).
+ */
+#define R300_GA_POINT_SIZE                   0x421C
+#       define R300_POINTSIZE_Y_SHIFT         0
+#       define R300_POINTSIZE_Y_MASK          0x0000ffff
+#       define R300_POINTSIZE_X_SHIFT         16
+#       define R300_POINTSIZE_X_MASK          0xffff0000
+#       define R300_POINTSIZE_MAX             (R300_POINTSIZE_Y_MASK / 6)
+
+/* Blue fill color */
+#define R500_GA_FILL_R                                0x4220
+
+/* Blue fill color */
+#define R500_GA_FILL_G                                0x4224
+
+/* Blue fill color */
+#define R500_GA_FILL_B                                0x4228
+
+/* Alpha fill color */
+#define R500_GA_FILL_A                                0x422c
+
+
+/* Specifies maximum and minimum point & sprite sizes for per vertex size
+ * specification. The lower part (15:0) is MIN and (31:16) is max.
+ */
+#define R300_GA_POINT_MINMAX                0x4230
+#       define R300_GA_POINT_MINMAX_MIN_SHIFT          0
+#       define R300_GA_POINT_MINMAX_MIN_MASK           (0xFFFF << 0)
+#       define R300_GA_POINT_MINMAX_MAX_SHIFT          16
+#       define R300_GA_POINT_MINMAX_MAX_MASK           (0xFFFF << 16)
+
+/* 1/2 width of line, in subpixels (1/12 or 1/16 only, even in 8b
+ * subprecision); (16.0) fixed format.
+ *
+ * The line width is given in multiples of 6.
+ * In default mode lines are classified as vertical lines.
+ * HO: horizontal
+ * VE: vertical or horizontal
+ * HO & VE: no classification
+ */
+#define R300_GA_LINE_CNTL                             0x4234
+#       define R300_GA_LINE_CNTL_WIDTH_SHIFT       0
+#       define R300_GA_LINE_CNTL_WIDTH_MASK        0x0000ffff
+#      define R300_GA_LINE_CNTL_END_TYPE_HOR      (0 << 16)
+#      define R300_GA_LINE_CNTL_END_TYPE_VER      (1 << 16)
+#      define R300_GA_LINE_CNTL_END_TYPE_SQR      (2 << 16) /* horizontal or vertical depending upon slope */
+#      define R300_GA_LINE_CNTL_END_TYPE_COMP     (3 << 16) /* Computed (perpendicular to slope) */
+#      define R500_GA_LINE_CNTL_SORT_NO           (0 << 18)
+#      define R500_GA_LINE_CNTL_SORT_MINX_MINY    (1 << 18)
+/** TODO: looks wrong */
+#       define R300_LINESIZE_MAX              (R300_GA_LINE_CNTL_WIDTH_MASK / 6)
+/** TODO: looks wrong */
+#       define R300_LINE_CNT_HO               (1 << 16)
+/** TODO: looks wrong */
+#       define R300_LINE_CNT_VE               (1 << 17)
+
+/* Line Stipple configuration information. */
+#define R300_GA_LINE_STIPPLE_CONFIG                   0x4238
+#      define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_NO     (0 << 0)
+#      define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_LINE   (1 << 0)
+#      define R300_GA_LINE_STIPPLE_CONFIG_LINE_RESET_PACKET (2 << 0)
+#      define R300_GA_LINE_STIPPLE_CONFIG_STIPPLE_SCALE_SHIFT 2
+#      define R300_GA_LINE_STIPPLE_CONFIG_STIPPLE_SCALE_MASK  0xfffffffc
+
+/* Used to load US instructions and constants */
+#define R500_GA_US_VECTOR_INDEX               0x4250
+#      define R500_GA_US_VECTOR_INDEX_SHIFT       0
+#      define R500_GA_US_VECTOR_INDEX_MASK        0x000000ff
+#      define R500_GA_US_VECTOR_INDEX_TYPE_INSTR  (0 << 16)
+#      define R500_GA_US_VECTOR_INDEX_TYPE_CONST  (1 << 16)
+#      define R500_GA_US_VECTOR_INDEX_CLAMP_NO    (0 << 17)
+#      define R500_GA_US_VECTOR_INDEX_CLAMP_CONST (1 << 17)
+
+/* Data register for loading US instructions and constants */
+#define R500_GA_US_VECTOR_DATA                0x4254
+
+/* Specifies color properties and mappings of textures. */
+#define R500_GA_COLOR_CONTROL_PS3                     0x4258
+#      define R500_TEX0_SHADING_PS3_SOLID       (0 << 0)
+#      define R500_TEX0_SHADING_PS3_FLAT        (1 << 0)
+#      define R500_TEX0_SHADING_PS3_GOURAUD     (2 << 0)
+#      define R500_TEX1_SHADING_PS3_SOLID       (0 << 2)
+#      define R500_TEX1_SHADING_PS3_FLAT        (1 << 2)
+#      define R500_TEX1_SHADING_PS3_GOURAUD     (2 << 2)
+#      define R500_TEX2_SHADING_PS3_SOLID       (0 << 4)
+#      define R500_TEX2_SHADING_PS3_FLAT        (1 << 4)
+#      define R500_TEX2_SHADING_PS3_GOURAUD     (2 << 4)
+#      define R500_TEX3_SHADING_PS3_SOLID       (0 << 6)
+#      define R500_TEX3_SHADING_PS3_FLAT        (1 << 6)
+#      define R500_TEX3_SHADING_PS3_GOURAUD     (2 << 6)
+#      define R500_TEX4_SHADING_PS3_SOLID       (0 << 8)
+#      define R500_TEX4_SHADING_PS3_FLAT        (1 << 8)
+#      define R500_TEX4_SHADING_PS3_GOURAUD     (2 << 8)
+#      define R500_TEX5_SHADING_PS3_SOLID       (0 << 10)
+#      define R500_TEX5_SHADING_PS3_FLAT        (1 << 10)
+#      define R500_TEX5_SHADING_PS3_GOURAUD     (2 << 10)
+#      define R500_TEX6_SHADING_PS3_SOLID       (0 << 12)
+#      define R500_TEX6_SHADING_PS3_FLAT        (1 << 12)
+#      define R500_TEX6_SHADING_PS3_GOURAUD     (2 << 12)
+#      define R500_TEX7_SHADING_PS3_SOLID       (0 << 14)
+#      define R500_TEX7_SHADING_PS3_FLAT        (1 << 14)
+#      define R500_TEX7_SHADING_PS3_GOURAUD     (2 << 14)
+#      define R500_TEX8_SHADING_PS3_SOLID       (0 << 16)
+#      define R500_TEX8_SHADING_PS3_FLAT        (1 << 16)
+#      define R500_TEX8_SHADING_PS3_GOURAUD     (2 << 16)
+#      define R500_TEX9_SHADING_PS3_SOLID       (0 << 18)
+#      define R500_TEX9_SHADING_PS3_FLAT        (1 << 18)
+#      define R500_TEX9_SHADING_PS3_GOURAUD     (2 << 18)
+#      define R500_TEX10_SHADING_PS3_SOLID      (0 << 20)
+#      define R500_TEX10_SHADING_PS3_FLAT       (1 << 20)
+#      define R500_TEX10_SHADING_PS3_GOURAUD    (2 << 20)
+#      define R500_COLOR0_TEX_OVERRIDE_NO       (0 << 22)
+#      define R500_COLOR0_TEX_OVERRIDE_TEX_0    (1 << 22)
+#      define R500_COLOR0_TEX_OVERRIDE_TEX_1    (2 << 22)
+#      define R500_COLOR0_TEX_OVERRIDE_TEX_2    (3 << 22)
+#      define R500_COLOR0_TEX_OVERRIDE_TEX_3    (4 << 22)
+#      define R500_COLOR0_TEX_OVERRIDE_TEX_4    (5 << 22)
+#      define R500_COLOR0_TEX_OVERRIDE_TEX_5    (6 << 22)
+#      define R500_COLOR0_TEX_OVERRIDE_TEX_6    (7 << 22)
+#      define R500_COLOR0_TEX_OVERRIDE_TEX_7    (8 << 22)
+#      define R500_COLOR0_TEX_OVERRIDE_TEX_8_C2 (9 << 22)
+#      define R500_COLOR0_TEX_OVERRIDE_TEX_9_C3 (10 << 22)
+#      define R500_COLOR1_TEX_OVERRIDE_NO       (0 << 26)
+#      define R500_COLOR1_TEX_OVERRIDE_TEX_0    (1 << 26)
+#      define R500_COLOR1_TEX_OVERRIDE_TEX_1    (2 << 26)
+#      define R500_COLOR1_TEX_OVERRIDE_TEX_2    (3 << 26)
+#      define R500_COLOR1_TEX_OVERRIDE_TEX_3    (4 << 26)
+#      define R500_COLOR1_TEX_OVERRIDE_TEX_4    (5 << 26)
+#      define R500_COLOR1_TEX_OVERRIDE_TEX_5    (6 << 26)
+#      define R500_COLOR1_TEX_OVERRIDE_TEX_6    (7 << 26)
+#      define R500_COLOR1_TEX_OVERRIDE_TEX_7    (8 << 26)
+#      define R500_COLOR1_TEX_OVERRIDE_TEX_8_C2 (9 << 26)
+#      define R500_COLOR1_TEX_OVERRIDE_TEX_9_C3 (10 << 26)
+
+/* Returns idle status of various G3D block, captured when GA_IDLE written or
+ * when hard or soft reset asserted.
+ */
+#define R500_GA_IDLE                                  0x425c
+#      define R500_GA_IDLE_PIPE3_Z_IDLE  (0 << 0)
+#      define R500_GA_IDLE_PIPE2_Z_IDLE  (0 << 1)
+#      define R500_GA_IDLE_PIPE3_CD_IDLE (0 << 2)
+#      define R500_GA_IDLE_PIPE2_CD_IDLE (0 << 3)
+#      define R500_GA_IDLE_PIPE3_FG_IDLE (0 << 4)
+#      define R500_GA_IDLE_PIPE2_FG_IDLE (0 << 5)
+#      define R500_GA_IDLE_PIPE3_US_IDLE (0 << 6)
+#      define R500_GA_IDLE_PIPE2_US_IDLE (0 << 7)
+#      define R500_GA_IDLE_PIPE3_SC_IDLE (0 << 8)
+#      define R500_GA_IDLE_PIPE2_SC_IDLE (0 << 9)
+#      define R500_GA_IDLE_PIPE3_RS_IDLE (0 << 10)
+#      define R500_GA_IDLE_PIPE2_RS_IDLE (0 << 11)
+#      define R500_GA_IDLE_PIPE1_Z_IDLE  (0 << 12)
+#      define R500_GA_IDLE_PIPE0_Z_IDLE  (0 << 13)
+#      define R500_GA_IDLE_PIPE1_CD_IDLE (0 << 14)
+#      define R500_GA_IDLE_PIPE0_CD_IDLE (0 << 15)
+#      define R500_GA_IDLE_PIPE1_FG_IDLE (0 << 16)
+#      define R500_GA_IDLE_PIPE0_FG_IDLE (0 << 17)
+#      define R500_GA_IDLE_PIPE1_US_IDLE (0 << 18)
+#      define R500_GA_IDLE_PIPE0_US_IDLE (0 << 19)
+#      define R500_GA_IDLE_PIPE1_SC_IDLE (0 << 20)
+#      define R500_GA_IDLE_PIPE0_SC_IDLE (0 << 21)
+#      define R500_GA_IDLE_PIPE1_RS_IDLE (0 << 22)
+#      define R500_GA_IDLE_PIPE0_RS_IDLE (0 << 23)
+#      define R500_GA_IDLE_SU_IDLE       (0 << 24)
+#      define R500_GA_IDLE_GA_IDLE       (0 << 25)
+#      define R500_GA_IDLE_GA_UNIT2_IDLE (0 << 26)
+
+/* Current value of stipple accumulator. */
+#define R300_GA_LINE_STIPPLE_VALUE            0x4260
+
+/* S Texture Coordinate Value for Vertex 0 of Line (stuff textures -- i.e. AA) */
+#define R300_GA_LINE_S0                               0x4264
+/* S Texture Coordinate Value for Vertex 1 of Lines (V2 of parallelogram -- stuff textures -- i.e. AA) */
+#define R300_GA_LINE_S1                               0x4268
+
+/* GA Input fifo high water marks */
+#define R500_GA_FIFO_CNTL                             0x4270
+#      define R500_GA_FIFO_CNTL_VERTEX_FIFO_MASK   0x00000007
+#      define R500_GA_FIFO_CNTL_VERTEX_FIFO_SHIFT  0
+#      define R500_GA_FIFO_CNTL_VERTEX_INDEX_MASK  0x00000038
+#      define R500_GA_FIFO_CNTL_VERTEX_INDEX_SHIFT 3
+#      define R500_GA_FIFO_CNTL_VERTEX_REG_MASK    0x00003fc0
+#      define R500_GA_FIFO_CNTL_VERTEX_REG_SHIFT   6
+
+/* GA enhance/tweaks */
+#define R300_GA_ENHANCE                               0x4274
+#      define R300_GA_ENHANCE_DEADLOCK_CNTL_NO_EFFECT   (0 << 0)
+#      define R300_GA_ENHANCE_DEADLOCK_CNTL_PREVENT_TCL (1 << 0) /* Prevents TCL interface from deadlocking on GA side. */
+#      define R300_GA_ENHANCE_FASTSYNC_CNTL_NO_EFFECT   (0 << 1)
+#      define R300_GA_ENHANCE_FASTSYNC_CNTL_ENABLE      (1 << 1) /* Enables high-performance register/primitive switching. */
+#      define R500_GA_ENHANCE_REG_READWRITE_NO_EFFECT   (0 << 2) /* R520+ only */
+#      define R500_GA_ENHANCE_REG_READWRITE_ENABLE      (1 << 2) /* R520+ only, Enables GA support of simultaneous register reads and writes. */
+#      define R500_GA_ENHANCE_REG_NOSTALL_NO_EFFECT     (0 << 3)
+#      define R500_GA_ENHANCE_REG_NOSTALL_ENABLE        (1 << 3) /* Enables GA support of no-stall reads for register read back. */
+
+#define R300_GA_COLOR_CONTROL                   0x4278
+#      define R300_GA_COLOR_CONTROL_RGB0_SHADING_SOLID      (0 << 0)
+#      define R300_GA_COLOR_CONTROL_RGB0_SHADING_FLAT       (1 << 0)
+#      define R300_GA_COLOR_CONTROL_RGB0_SHADING_GOURAUD    (2 << 0)
+#      define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_SOLID    (0 << 2)
+#      define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_FLAT     (1 << 2)
+#      define R300_GA_COLOR_CONTROL_ALPHA0_SHADING_GOURAUD  (2 << 2)
+#      define R300_GA_COLOR_CONTROL_RGB1_SHADING_SOLID      (0 << 4)
+#      define R300_GA_COLOR_CONTROL_RGB1_SHADING_FLAT       (1 << 4)
+#      define R300_GA_COLOR_CONTROL_RGB1_SHADING_GOURAUD    (2 << 4)
+#      define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_SOLID    (0 << 6)
+#      define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_FLAT     (1 << 6)
+#      define R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD  (2 << 6)
+#      define R300_GA_COLOR_CONTROL_RGB2_SHADING_SOLID      (0 << 8)
+#      define R300_GA_COLOR_CONTROL_RGB2_SHADING_FLAT       (1 << 8)
+#      define R300_GA_COLOR_CONTROL_RGB2_SHADING_GOURAUD    (2 << 8)
+#      define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_SOLID    (0 << 10)
+#      define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_FLAT     (1 << 10)
+#      define R300_GA_COLOR_CONTROL_ALPHA2_SHADING_GOURAUD  (2 << 10)
+#      define R300_GA_COLOR_CONTROL_RGB3_SHADING_SOLID      (0 << 12)
+#      define R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT       (1 << 12)
+#      define R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD    (2 << 12)
+#      define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_SOLID    (0 << 14)
+#      define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_FLAT     (1 << 14)
+#      define R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD  (2 << 14)
+#      define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_FIRST  (0 << 16)
+#      define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_SECOND (1 << 16)
+#      define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_THIRD  (2 << 16)
+#      define R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST   (3 << 16)
+
+/** TODO: might be candidate for removal */
+#      define R300_RE_SHADE_MODEL_SMOOTH     ( \
+       R300_GA_COLOR_CONTROL_RGB0_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA0_SHADING_GOURAUD | \
+       R300_GA_COLOR_CONTROL_RGB1_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD | \
+       R300_GA_COLOR_CONTROL_RGB2_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA2_SHADING_GOURAUD | \
+       R300_GA_COLOR_CONTROL_RGB3_SHADING_GOURAUD | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD | \
+       R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST )
+/** TODO: might be candidate for removal, the GOURAUD stuff also looks buggy to me */
+#      define R300_RE_SHADE_MODEL_FLAT     ( \
+       R300_GA_COLOR_CONTROL_RGB0_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA0_SHADING_FLAT | \
+       R300_GA_COLOR_CONTROL_RGB1_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA1_SHADING_GOURAUD | \
+       R300_GA_COLOR_CONTROL_RGB2_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA2_SHADING_FLAT | \
+       R300_GA_COLOR_CONTROL_RGB3_SHADING_FLAT | R300_GA_COLOR_CONTROL_ALPHA3_SHADING_GOURAUD | \
+       R300_GA_COLOR_CONTROL_PROVOKING_VERTEX_LAST )
+
+/* Specifies red & green components of fill color -- S312 format -- Backwards comp. */
+#define R300_GA_SOLID_RG                         0x427c
+#      define GA_SOLID_RG_COLOR_GREEN_SHIFT 0
+#      define GA_SOLID_RG_COLOR_GREEN_MASK  0x0000ffff
+#      define GA_SOLID_RG_COLOR_RED_SHIFT   16
+#      define GA_SOLID_RG_COLOR_RED_MASK    0xffff0000
+/* Specifies blue & alpha components of fill color -- S312 format -- Backwards comp. */
+#define R300_GA_SOLID_BA                         0x4280
+#      define GA_SOLID_BA_COLOR_ALPHA_SHIFT 0
+#      define GA_SOLID_BA_COLOR_ALPHA_MASK  0x0000ffff
+#      define GA_SOLID_BA_COLOR_BLUE_SHIFT  16
+#      define GA_SOLID_BA_COLOR_BLUE_MASK   0xffff0000
+
+/* Polygon Mode
+ * Dangerous
+ */
+#define R300_GA_POLY_MODE                             0x4288
+#      define R300_GA_POLY_MODE_DISABLE           (0 << 0)
+#      define R300_GA_POLY_MODE_DUAL              (1 << 0) /* send 2 sets of 3 polys with specified poly type */
+/* reserved */
+#      define R300_GA_POLY_MODE_FRONT_PTYPE_POINT (0 << 4)
+#      define R300_GA_POLY_MODE_FRONT_PTYPE_LINE  (1 << 4)
+#      define R300_GA_POLY_MODE_FRONT_PTYPE_TRI   (2 << 4)
+/* reserved */
+#      define R300_GA_POLY_MODE_BACK_PTYPE_POINT  (0 << 7)
+#      define R300_GA_POLY_MODE_BACK_PTYPE_LINE   (1 << 7)
+#      define R300_GA_POLY_MODE_BACK_PTYPE_TRI    (2 << 7)
+/* reserved */
+
+/* Specifies the rouding mode for geometry & color SPFP to FP conversions. */
+#define R300_GA_ROUND_MODE                            0x428c
+#      define R300_GA_ROUND_MODE_GEOMETRY_ROUND_TRUNC   (0 << 0)
+#      define R300_GA_ROUND_MODE_GEOMETRY_ROUND_NEAREST (1 << 0)
+#      define R300_GA_ROUND_MODE_COLOR_ROUND_TRUNC      (0 << 2)
+#      define R300_GA_ROUND_MODE_COLOR_ROUND_NEAREST    (1 << 2)
+#      define R300_GA_ROUND_MODE_RGB_CLAMP_RGB          (0 << 4)
+#      define R300_GA_ROUND_MODE_RGB_CLAMP_FP20         (1 << 4)
+#      define R300_GA_ROUND_MODE_ALPHA_CLAMP_RGB        (0 << 5)
+#      define R300_GA_ROUND_MODE_ALPHA_CLAMP_FP20       (1 << 5)
+#      define R500_GA_ROUND_MODE_GEOMETRY_MASK_SHIFT    6
+#      define R500_GA_ROUND_MODE_GEOMETRY_MASK_MASK     0x000003c0
+
+/* Specifies x & y offsets for vertex data after conversion to FP.
+ * Offsets are in S15 format (subpixels -- 1/12 or 1/16, even in 8b
+ * subprecision).
+ */
+#define R300_GA_OFFSET                                0x4290
+#      define R300_GA_OFFSET_X_OFFSET_SHIFT 0
+#      define R300_GA_OFFSET_X_OFFSET_MASK  0x0000ffff
+#      define R300_GA_OFFSET_Y_OFFSET_SHIFT 16
+#      define R300_GA_OFFSET_Y_OFFSET_MASK  0xffff0000
+
+/* Specifies the scale to apply to fog. */
+#define R300_GA_FOG_SCALE                     0x4294
+/* Specifies the offset to apply to fog. */
+#define R300_GA_FOG_OFFSET                    0x4298
+/* Specifies number of cycles to assert reset, and also causes RB3D soft reset to assert. */
+#define R300_GA_SOFT_RESET                    0x429c
+
+/* Not sure why there are duplicate of factor and constant values.
+ * My best guess so far is that there are seperate zbiases for test and write.
+ * Ordering might be wrong.
+ * Some of the tests indicate that fgl has a fallback implementation of zbias
+ * via pixel shaders.
+ */
+#define R300_SU_TEX_WRAP                      0x42A0
+#define R300_SU_POLY_OFFSET_FRONT_SCALE       0x42A4
+#define R300_SU_POLY_OFFSET_FRONT_OFFSET      0x42A8
+#define R300_SU_POLY_OFFSET_BACK_SCALE        0x42AC
+#define R300_SU_POLY_OFFSET_BACK_OFFSET       0x42B0
+
+/* This register needs to be set to (1<<1) for RV350 to correctly
+ * perform depth test (see --vb-triangles in r300_demo)
+ * Don't know about other chips. - Vladimir
+ * This is set to 3 when GL_POLYGON_OFFSET_FILL is on.
+ * My guess is that there are two bits for each zbias primitive
+ * (FILL, LINE, POINT).
+ *  One to enable depth test and one for depth write.
+ * Yet this doesnt explain why depth writes work ...
+ */
+#define R300_SU_POLY_OFFSET_ENABLE            0x42B4
+#      define R300_FRONT_ENABLE               (1 << 0)
+#      define R300_BACK_ENABLE                (1 << 1)
+#      define R300_PARA_ENABLE                (1 << 2)
+
+#define R300_SU_CULL_MODE                      0x42B8
+#       define R300_CULL_FRONT                   (1 << 0)
+#       define R300_CULL_BACK                    (1 << 1)
+#       define R300_FRONT_FACE_CCW               (0 << 2)
+#       define R300_FRONT_FACE_CW                (1 << 2)
+
+/* SU Depth Scale value */
+#define R300_SU_DEPTH_SCALE                 0x42c0
+/* SU Depth Offset value */
+#define R300_SU_DEPTH_OFFSET                0x42c4
+
+
+/* BEGIN: Rasterization / Interpolators - many guesses */
+
+/*
+ * TC_CNT is the number of incoming texture coordinate sets (i.e. it depends
+ * on the vertex program, *not* the fragment program)
+ */
+#define R300_RS_COUNT                      0x4300
+#       define R300_IT_COUNT_SHIFT               0
+#       define R300_IT_COUNT_MASK                0x0000007f
+#       define R300_IC_COUNT_SHIFT               7
+#       define R300_IC_COUNT_MASK                0x00000780
+#       define R300_W_ADDR_SHIFT                 12
+#       define R300_W_ADDR_MASK                  0x0003f000
+#       define R300_HIRES_DIS                    (0 << 18)
+#       define R300_HIRES_EN                     (1 << 18)
+
+#define R300_RS_INST_COUNT                       0x4304
+#       define R300_RS_INST_COUNT_SHIFT          0
+#       define R300_RS_INST_COUNT_MASK           0x0000000f
+#       define R300_RS_TX_OFFSET_SHIFT           5
+#      define R300_RS_TX_OFFSET_MASK            0x000000e0
+
+/* gap */
+
+/* Only used for texture coordinates.
+ * Use the source field to route texture coordinate input from the
+ * vertex program to the desired interpolator. Note that the source
+ * field is relative to the outputs the vertex program *actually*
+ * writes. If a vertex program only writes texcoord[1], this will
+ * be source index 0.
+ * Set INTERP_USED on all interpolators that produce data used by
+ * the fragment program. INTERP_USED looks like a swizzling mask,
+ * but I haven't seen it used that way.
+ *
+ * Note: The _UNKNOWN constants are always set in their respective
+ * register. I don't know if this is necessary.
+ */
+#define R300_RS_IP_0                                   0x4310
+#define R300_RS_IP_1                                   0x4314
+#define R300_RS_IP_2                                   0x4318
+#define R300_RS_IP_3                                   0x431C
+#       define R300_RS_INTERP_SRC_SHIFT          2 /* TODO: check for removal */
+#       define R300_RS_INTERP_SRC_MASK           (7 << 2) /* TODO: check for removal */
+#      define R300_RS_TEX_PTR(x)                       ((x) << 0)
+#      define R300_RS_COL_PTR(x)                       ((x) << 6)
+#      define R300_RS_COL_FMT(x)                       ((x) << 9)
+#      define R300_RS_COL_FMT_RGBA                     0
+#      define R300_RS_COL_FMT_RGB0                     1
+#      define R300_RS_COL_FMT_RGB1                     2
+#      define R300_RS_COL_FMT_000A                     4
+#      define R300_RS_COL_FMT_0000                     5
+#      define R300_RS_COL_FMT_0001                     6
+#      define R300_RS_COL_FMT_111A                     8
+#      define R300_RS_COL_FMT_1110                     9
+#      define R300_RS_COL_FMT_1111                     10
+#      define R300_RS_SEL_S(x)                         ((x) << 13)
+#      define R300_RS_SEL_T(x)                         ((x) << 16)
+#      define R300_RS_SEL_R(x)                         ((x) << 19)
+#      define R300_RS_SEL_Q(x)                         ((x) << 22)
+#      define R300_RS_SEL_C0                           0
+#      define R300_RS_SEL_C1                           1
+#      define R300_RS_SEL_C2                           2
+#      define R300_RS_SEL_C3                           3
+#      define R300_RS_SEL_K0                           4
+#      define R300_RS_SEL_K1                           5
+
+
+/*  */
+#define R500_RS_INST_0                                 0x4320
+#define R500_RS_INST_1                                 0x4324
+#define R500_RS_INST_2                                 0x4328
+#define R500_RS_INST_3                                 0x432c
+#define R500_RS_INST_4                                 0x4330
+#define R500_RS_INST_5                                 0x4334
+#define R500_RS_INST_6                                 0x4338
+#define R500_RS_INST_7                                 0x433c
+#define R500_RS_INST_8                                 0x4340
+#define R500_RS_INST_9                                 0x4344
+#define R500_RS_INST_10                                        0x4348
+#define R500_RS_INST_11                                        0x434c
+#define R500_RS_INST_12                                        0x4350
+#define R500_RS_INST_13                                        0x4354
+#define R500_RS_INST_14                                        0x4358
+#define R500_RS_INST_15                                        0x435c
+#define R500_RS_INST_TEX_ID_SHIFT                      0
+#define R500_RS_INST_TEX_CN_WRITE                      (1 << 4)
+#define R500_RS_INST_TEX_ADDR_SHIFT                    5
+#define R500_RS_INST_COL_ID_SHIFT                      12
+#define R500_RS_INST_COL_CN_NO_WRITE                   (0 << 16)
+#define R500_RS_INST_COL_CN_WRITE                      (1 << 16)
+#define R500_RS_INST_COL_CN_WRITE_FBUFFER              (2 << 16)
+#define R500_RS_INST_COL_CN_WRITE_BACKFACE             (3 << 16)
+#define R500_RS_INST_COL_ADDR_SHIFT                    18
+#define R500_RS_INST_TEX_ADJ                           (1 << 25)
+#define R500_RS_INST_W_CN                              (1 << 26)
+#define R500_RS_INST_TEX_ID(x)                         ((x) << R500_RS_INST_TEX_ID_SHIFT)
+#define R500_RS_INST_TEX_ADDR(x)                       ((x) << R500_RS_INST_TEX_ADDR_SHIFT)
+#define R500_RS_INST_COL_ID(x)                         ((x) << R500_RS_INST_COL_ID_SHIFT)
+#define R500_RS_INST_COL_ADDR(x)                       ((x) << R500_RS_INST_COL_ADDR_SHIFT)
+
+/* These DWORDs control how vertex data is routed into fragment program
+ * registers, after interpolators.
+ */
+#define R300_RS_INST_0                     0x4330
+#define R300_RS_INST_1                     0x4334
+#define R300_RS_INST_2                     0x4338
+#define R300_RS_INST_3                     0x433C /* GUESS */
+#define R300_RS_INST_4                     0x4340 /* GUESS */
+#define R300_RS_INST_5                     0x4344 /* GUESS */
+#define R300_RS_INST_6                     0x4348 /* GUESS */
+#define R300_RS_INST_7                     0x434C /* GUESS */
+#      define R300_RS_INST_TEX_ID(x)           ((x) << 0)
+#      define R300_RS_INST_TEX_CN_WRITE        (1 << 3)
+#      define R300_RS_INST_TEX_ADDR_SHIFT      6
+#      define R300_RS_INST_TEX_ADDR(x)         ((x) << R300_RS_INST_TEX_ADDR_SHIFT)
+#      define R300_RS_INST_COL_ID(x)           ((x) << 11)
+#      define R300_RS_INST_COL_CN_WRITE        (1 << 14)
+#      define R300_RS_INST_COL_ADDR_SHIFT      17
+#      define R300_RS_INST_COL_ADDR(x)         ((x) << R300_RS_INST_COL_ADDR_SHIFT)
+#      define R300_RS_INST_TEX_ADJ             (1 << 22)
+#      define R300_RS_COL_BIAS_UNUSED_SHIFT    23
+
+/* END: Rasterization / Interpolators - many guesses */
+
+/* Hierarchical Z Enable */
+#define R300_SC_HYPERZ                   0x43a4
+#      define R300_SC_HYPERZ_DISABLE     (0 << 0)
+#      define R300_SC_HYPERZ_ENABLE      (1 << 0)
+#      define R300_SC_HYPERZ_MIN         (0 << 1)
+#      define R300_SC_HYPERZ_MAX         (1 << 1)
+#      define R300_SC_HYPERZ_ADJ_256     (0 << 2)
+#      define R300_SC_HYPERZ_ADJ_128     (1 << 2)
+#      define R300_SC_HYPERZ_ADJ_64      (2 << 2)
+#      define R300_SC_HYPERZ_ADJ_32      (3 << 2)
+#      define R300_SC_HYPERZ_ADJ_16      (4 << 2)
+#      define R300_SC_HYPERZ_ADJ_8       (5 << 2)
+#      define R300_SC_HYPERZ_ADJ_4       (6 << 2)
+#      define R300_SC_HYPERZ_ADJ_2       (7 << 2)
+#      define R300_SC_HYPERZ_HZ_Z0MIN_NO (0 << 5)
+#      define R300_SC_HYPERZ_HZ_Z0MIN    (1 << 5)
+#      define R300_SC_HYPERZ_HZ_Z0MAX_NO (0 << 6)
+#      define R300_SC_HYPERZ_HZ_Z0MAX    (1 << 6)
+
+#define R300_SC_EDGERULE                 0x43a8
+
+/* BEGIN: Scissors and cliprects */
+
+/* There are four clipping rectangles. Their corner coordinates are inclusive.
+ * Every pixel is assigned a number from 0 and 15 by setting bits 0-3 depending
+ * on whether the pixel is inside cliprects 0-3, respectively. For example,
+ * if a pixel is inside cliprects 0 and 1, but outside 2 and 3, it is assigned
+ * the number 3 (binary 0011).
+ * Iff the bit corresponding to the pixel's number in RE_CLIPRECT_CNTL is set,
+ * the pixel is rasterized.
+ *
+ * In addition to this, there is a scissors rectangle. Only pixels inside the
+ * scissors rectangle are drawn. (coordinates are inclusive)
+ *
+ * For some reason, the top-left corner of the framebuffer is at (1440, 1440)
+ * for the purpose of clipping and scissors.
+ */
+#define R300_SC_CLIPRECT_TL_0               0x43B0
+#define R300_SC_CLIPRECT_BR_0               0x43B4
+#define R300_SC_CLIPRECT_TL_1               0x43B8
+#define R300_SC_CLIPRECT_BR_1               0x43BC
+#define R300_SC_CLIPRECT_TL_2               0x43C0
+#define R300_SC_CLIPRECT_BR_2               0x43C4
+#define R300_SC_CLIPRECT_TL_3               0x43C8
+#define R300_SC_CLIPRECT_BR_3               0x43CC
+#       define R300_CLIPRECT_OFFSET              1440
+#       define R300_CLIPRECT_MASK                0x1FFF
+#       define R300_CLIPRECT_X_SHIFT             0
+#       define R300_CLIPRECT_X_MASK              (0x1FFF << 0)
+#       define R300_CLIPRECT_Y_SHIFT             13
+#       define R300_CLIPRECT_Y_MASK              (0x1FFF << 13)
+#define R300_SC_CLIP_RULE                   0x43D0
+#       define R300_CLIP_OUT                     (1 << 0)
+#       define R300_CLIP_0                       (1 << 1)
+#       define R300_CLIP_1                       (1 << 2)
+#       define R300_CLIP_10                      (1 << 3)
+#       define R300_CLIP_2                       (1 << 4)
+#       define R300_CLIP_20                      (1 << 5)
+#       define R300_CLIP_21                      (1 << 6)
+#       define R300_CLIP_210                     (1 << 7)
+#       define R300_CLIP_3                       (1 << 8)
+#       define R300_CLIP_30                      (1 << 9)
+#       define R300_CLIP_31                      (1 << 10)
+#       define R300_CLIP_310                     (1 << 11)
+#       define R300_CLIP_32                      (1 << 12)
+#       define R300_CLIP_320                     (1 << 13)
+#       define R300_CLIP_321                     (1 << 14)
+#       define R300_CLIP_3210                    (1 << 15)
+
+/* gap */
+
+#define R300_SC_SCISSORS_TL                 0x43E0
+#define R300_SC_SCISSORS_BR                 0x43E4
+#       define R300_SCISSORS_OFFSET              1440
+#       define R300_SCISSORS_X_SHIFT             0
+#       define R300_SCISSORS_X_MASK              (0x1FFF << 0)
+#       define R300_SCISSORS_Y_SHIFT             13
+#       define R300_SCISSORS_Y_MASK              (0x1FFF << 13)
+
+/* Screen door sample mask */
+#define R300_SC_SCREENDOOR                 0x43e8
+
+/* END: Scissors and cliprects */
+
+/* BEGIN: Texture specification */
+
+/*
+ * The texture specification dwords are grouped by meaning and not by texture
+ * unit. This means that e.g. the offset for texture image unit N is found in
+ * register TX_OFFSET_0 + (4*N)
+ */
+#define R300_TX_FILTER0_0                        0x4400
+#define R300_TX_FILTER0_1                        0x4404
+#define R300_TX_FILTER0_2                        0x4408
+#define R300_TX_FILTER0_3                        0x440c
+#define R300_TX_FILTER0_4                        0x4410
+#define R300_TX_FILTER0_5                        0x4414
+#define R300_TX_FILTER0_6                        0x4418
+#define R300_TX_FILTER0_7                        0x441c
+#define R300_TX_FILTER0_8                        0x4420
+#define R300_TX_FILTER0_9                        0x4424
+#define R300_TX_FILTER0_10                       0x4428
+#define R300_TX_FILTER0_11                       0x442c
+#define R300_TX_FILTER0_12                       0x4430
+#define R300_TX_FILTER0_13                       0x4434
+#define R300_TX_FILTER0_14                       0x4438
+#define R300_TX_FILTER0_15                       0x443c
+#       define R300_TX_REPEAT                    0
+#       define R300_TX_MIRRORED                  1
+#       define R300_TX_CLAMP_TO_EDGE             2
+#      define R300_TX_MIRROR_ONCE_TO_EDGE       3
+#       define R300_TX_CLAMP                     4
+#      define R300_TX_MIRROR_ONCE               5
+#       define R300_TX_CLAMP_TO_BORDER           6
+#      define R300_TX_MIRROR_ONCE_TO_BORDER     7
+#       define R300_TX_WRAP_S_SHIFT              0
+#       define R300_TX_WRAP_S_MASK               (7 << 0)
+#       define R300_TX_WRAP_T_SHIFT              3
+#       define R300_TX_WRAP_T_MASK               (7 << 3)
+#       define R300_TX_WRAP_R_SHIFT              6
+#       define R300_TX_WRAP_R_MASK               (7 << 6)
+#      define R300_TX_MAG_FILTER_4              (0 << 9)
+#       define R300_TX_MAG_FILTER_NEAREST        (1 << 9)
+#       define R300_TX_MAG_FILTER_LINEAR         (2 << 9)
+#       define R300_TX_MAG_FILTER_ANISO          (3 << 9)
+#       define R300_TX_MAG_FILTER_MASK           (3 << 9)
+#       define R300_TX_MIN_FILTER_NEAREST        (1 << 11)
+#       define R300_TX_MIN_FILTER_LINEAR         (2 << 11)
+#      define R300_TX_MIN_FILTER_ANISO          (3 << 11)
+#      define R300_TX_MIN_FILTER_MASK           (3 << 11)
+#      define R300_TX_MIN_FILTER_MIP_NONE       (0 << 13)
+#      define R300_TX_MIN_FILTER_MIP_NEAREST    (1 << 13)
+#      define R300_TX_MIN_FILTER_MIP_LINEAR     (2 << 13)
+#      define R300_TX_MIN_FILTER_MIP_MASK       (3 << 13)
+#      define R300_TX_MAX_ANISO_1_TO_1          (0 << 21)
+#      define R300_TX_MAX_ANISO_2_TO_1          (1 << 21)
+#      define R300_TX_MAX_ANISO_4_TO_1          (2 << 21)
+#      define R300_TX_MAX_ANISO_8_TO_1          (3 << 21)
+#      define R300_TX_MAX_ANISO_16_TO_1         (4 << 21)
+#      define R300_TX_MAX_ANISO_MASK            (7 << 21)
+
+#define R300_TX_FILTER1_0                      0x4440
+#      define R300_CHROMA_KEY_MODE_DISABLE    0
+#      define R300_CHROMA_KEY_FORCE           1
+#      define R300_CHROMA_KEY_BLEND           2
+#      define R300_MC_ROUND_NORMAL            (0<<2)
+#      define R300_MC_ROUND_MPEG4             (1<<2)
+#      define R300_LOD_BIAS_SHIFT             3
+#      define R300_LOD_BIAS_MASK              0x1ff8
+#      define R300_EDGE_ANISO_EDGE_DIAG       (0<<13)
+#      define R300_EDGE_ANISO_EDGE_ONLY       (1<<13)
+#      define R300_MC_COORD_TRUNCATE_DISABLE  (0<<14)
+#      define R300_MC_COORD_TRUNCATE_MPEG     (1<<14)
+#      define R300_TX_TRI_PERF_0_8            (0<<15)
+#      define R300_TX_TRI_PERF_1_8            (1<<15)
+#      define R300_TX_TRI_PERF_1_4            (2<<15)
+#      define R300_TX_TRI_PERF_3_8            (3<<15)
+#      define R300_ANISO_THRESHOLD_MASK       (7<<17)
+
+#      define R500_MACRO_SWITCH               (1<<22)
+#      define R500_BORDER_FIX                 (1<<31)
+
+#define R300_TX_SIZE_0                      0x4480
+#       define R300_TX_WIDTHMASK_SHIFT           0
+#       define R300_TX_WIDTHMASK_MASK            (2047 << 0)
+#       define R300_TX_HEIGHTMASK_SHIFT          11
+#       define R300_TX_HEIGHTMASK_MASK           (2047 << 11)
+#      define R300_TX_DEPTHMASK_SHIFT           22
+#      define R300_TX_DEPTHMASK_MASK            (0xf << 22)
+#       define R300_TX_MAX_MIP_LEVEL_SHIFT       26
+#       define R300_TX_MAX_MIP_LEVEL_MASK        (0xf << 26)
+#       define R300_TX_SIZE_PROJECTED            (1<<30)
+#       define R300_TX_SIZE_TXPITCH_EN           (1<<31)
+#define R300_TX_FORMAT_0                    0x44C0
+       /* The interpretation of the format word by Wladimir van der Laan */
+       /* The X, Y, Z and W refer to the layout of the components.
+          They are given meanings as R, G, B and Alpha by the swizzle
+          specification */
+#      define R300_TX_FORMAT_X8                    0x0
+#      define R500_TX_FORMAT_X1                    0x0 // bit set in format 2
+#      define R300_TX_FORMAT_X16                   0x1
+#      define R500_TX_FORMAT_X1_REV                0x0 // bit set in format 2
+#      define R300_TX_FORMAT_Y4X4                  0x2
+#      define R300_TX_FORMAT_Y8X8                  0x3
+#      define R300_TX_FORMAT_Y16X16                0x4
+#      define R300_TX_FORMAT_Z3Y3X2                0x5
+#      define R300_TX_FORMAT_Z5Y6X5                0x6
+#      define R300_TX_FORMAT_Z6Y5X5                0x7
+#      define R300_TX_FORMAT_Z11Y11X10             0x8
+#      define R300_TX_FORMAT_Z10Y11X11             0x9
+#      define R300_TX_FORMAT_W4Z4Y4X4              0xA
+#      define R300_TX_FORMAT_W1Z5Y5X5              0xB
+#      define R300_TX_FORMAT_W8Z8Y8X8              0xC
+#      define R300_TX_FORMAT_W2Z10Y10X10           0xD
+#      define R300_TX_FORMAT_W16Z16Y16X16          0xE
+#      define R300_TX_FORMAT_DXT1                  0xF
+#      define R300_TX_FORMAT_DXT3                  0x10
+#      define R300_TX_FORMAT_DXT5                  0x11
+#      define R300_TX_FORMAT_D3DMFT_CxV8U8         0x12     /* no swizzle */
+#      define R300_TX_FORMAT_A8R8G8B8              0x13     /* no swizzle */
+#      define R300_TX_FORMAT_B8G8_B8G8             0x14     /* no swizzle */
+#      define R300_TX_FORMAT_G8R8_G8B8             0x15     /* no swizzle */
+
+       /* These two values are wrong, but they're the only values that
+        * produce any even vaguely correct results.  Can r300 only do 16-bit
+        * depth textures?
+        */
+#      define R300_TX_FORMAT_X24_Y8                0x1e
+#      define R300_TX_FORMAT_X32                   0x1e
+
+       /* 0x16 - some 16 bit green format.. ?? */
+#      define R300_TX_FORMAT_3D                   (1 << 25)
+#      define R300_TX_FORMAT_CUBIC_MAP            (2 << 25)
+
+       /* gap */
+       /* Floating point formats */
+       /* Note - hardware supports both 16 and 32 bit floating point */
+#      define R300_TX_FORMAT_FL_I16                0x18
+#      define R300_TX_FORMAT_FL_I16A16             0x19
+#      define R300_TX_FORMAT_FL_R16G16B16A16       0x1A
+#      define R300_TX_FORMAT_FL_I32                0x1B
+#      define R300_TX_FORMAT_FL_I32A32             0x1C
+#      define R300_TX_FORMAT_FL_R32G32B32A32       0x1D
+       /* alpha modes, convenience mostly */
+       /* if you have alpha, pick constant appropriate to the
+          number of channels (1 for I8, 2 for I8A8, 4 for R8G8B8A8, etc */
+#      define R300_TX_FORMAT_ALPHA_1CH             0x000
+#      define R300_TX_FORMAT_ALPHA_2CH             0x200
+#      define R300_TX_FORMAT_ALPHA_4CH             0x600
+#      define R300_TX_FORMAT_ALPHA_NONE            0xA00
+       /* Swizzling */
+       /* constants */
+#      define R300_TX_FORMAT_X         0
+#      define R300_TX_FORMAT_Y         1
+#      define R300_TX_FORMAT_Z         2
+#      define R300_TX_FORMAT_W         3
+#      define R300_TX_FORMAT_ZERO      4
+#      define R300_TX_FORMAT_ONE       5
+       /* 2.0*Z, everything above 1.0 is set to 0.0 */
+#      define R300_TX_FORMAT_CUT_Z     6
+       /* 2.0*W, everything above 1.0 is set to 0.0 */
+#      define R300_TX_FORMAT_CUT_W     7
+
+#      define R300_TX_FORMAT_B_SHIFT   18
+#      define R300_TX_FORMAT_G_SHIFT   15
+#      define R300_TX_FORMAT_R_SHIFT   12
+#      define R300_TX_FORMAT_A_SHIFT   9
+       /* Convenience macro to take care of layout and swizzling */
+#      define R300_EASY_TX_FORMAT(B, G, R, A, FMT)     (               \
+               ((R300_TX_FORMAT_##B)<<R300_TX_FORMAT_B_SHIFT)          \
+               | ((R300_TX_FORMAT_##G)<<R300_TX_FORMAT_G_SHIFT)        \
+               | ((R300_TX_FORMAT_##R)<<R300_TX_FORMAT_R_SHIFT)        \
+               | ((R300_TX_FORMAT_##A)<<R300_TX_FORMAT_A_SHIFT)        \
+               | (R300_TX_FORMAT_##FMT)                                \
+               )
+       /* These can be ORed with result of R300_EASY_TX_FORMAT()
+          We don't really know what they do. Take values from a
+           constant color ? */
+#      define R300_TX_FORMAT_CONST_X           (1<<5)
+#      define R300_TX_FORMAT_CONST_Y           (2<<5)
+#      define R300_TX_FORMAT_CONST_Z           (4<<5)
+#      define R300_TX_FORMAT_CONST_W           (8<<5)
+
+#      define R300_TX_FORMAT_YUV_MODE          0x00800000
+
+#define R300_TX_FORMAT2_0                  0x4500 /* obvious missing in gap */
+#       define R300_TX_PITCHMASK_SHIFT           0
+#       define R300_TX_PITCHMASK_MASK            (2047 << 0)
+#      define R500_TXFORMAT_MSB                 (1 << 14)
+#      define R500_TXWIDTH_BIT11                (1 << 15)
+#      define R500_TXHEIGHT_BIT11               (1 << 16)
+#      define R500_POW2FIX2FLT                  (1 << 17)
+#      define R500_SEL_FILTER4_TC0              (0 << 18)
+#      define R500_SEL_FILTER4_TC1              (1 << 18)
+#      define R500_SEL_FILTER4_TC2              (2 << 18)
+#      define R500_SEL_FILTER4_TC3              (3 << 18)
+
+#define R300_TX_OFFSET_0                    0x4540
+#define R300_TX_OFFSET_1                    0x4544
+#define R300_TX_OFFSET_2                    0x4548
+#define R300_TX_OFFSET_3                    0x454C
+#define R300_TX_OFFSET_4                    0x4550
+#define R300_TX_OFFSET_5                    0x4554
+#define R300_TX_OFFSET_6                    0x4558
+#define R300_TX_OFFSET_7                    0x455C
+       /* BEGIN: Guess from R200 */
+#       define R300_TXO_ENDIAN_NO_SWAP           (0 << 0)
+#       define R300_TXO_ENDIAN_BYTE_SWAP         (1 << 0)
+#       define R300_TXO_ENDIAN_WORD_SWAP         (2 << 0)
+#       define R300_TXO_ENDIAN_HALFDW_SWAP       (3 << 0)
+#       define R300_TXO_MACRO_TILE               (1 << 2)
+#       define R300_TXO_MICRO_TILE_LINEAR        (0 << 3)
+#       define R300_TXO_MICRO_TILE               (1 << 3)
+#       define R300_TXO_MICRO_TILE_SQUARE        (2 << 3)
+#       define R300_TXO_OFFSET_MASK              0xffffffe0
+#       define R300_TXO_OFFSET_SHIFT             5
+       /* END: Guess from R200 */
+
+/* 32 bit chroma key */
+#define R300_TX_CHROMA_KEY_0                      0x4580
+#define R300_TX_CHROMA_KEY_1                      0x4584
+#define R300_TX_CHROMA_KEY_2                      0x4588
+#define R300_TX_CHROMA_KEY_3                      0x458c
+#define R300_TX_CHROMA_KEY_4                      0x4590
+#define R300_TX_CHROMA_KEY_5                      0x4594
+#define R300_TX_CHROMA_KEY_6                      0x4598
+#define R300_TX_CHROMA_KEY_7                      0x459c
+#define R300_TX_CHROMA_KEY_8                      0x45a0
+#define R300_TX_CHROMA_KEY_9                      0x45a4
+#define R300_TX_CHROMA_KEY_10                     0x45a8
+#define R300_TX_CHROMA_KEY_11                     0x45ac
+#define R300_TX_CHROMA_KEY_12                     0x45b0
+#define R300_TX_CHROMA_KEY_13                     0x45b4
+#define R300_TX_CHROMA_KEY_14                     0x45b8
+#define R300_TX_CHROMA_KEY_15                     0x45bc
+/* ff00ff00 == { 0, 1.0, 0, 1.0 } */
+
+/* Border Color */
+#define R300_TX_BORDER_COLOR_0              0x45c0
+#define R300_TX_BORDER_COLOR_1              0x45c4
+#define R300_TX_BORDER_COLOR_2              0x45c8
+#define R300_TX_BORDER_COLOR_3              0x45cc
+#define R300_TX_BORDER_COLOR_4              0x45d0
+#define R300_TX_BORDER_COLOR_5              0x45d4
+#define R300_TX_BORDER_COLOR_6              0x45d8
+#define R300_TX_BORDER_COLOR_7              0x45dc
+#define R300_TX_BORDER_COLOR_8              0x45e0
+#define R300_TX_BORDER_COLOR_9              0x45e4
+#define R300_TX_BORDER_COLOR_10             0x45e8
+#define R300_TX_BORDER_COLOR_11             0x45ec
+#define R300_TX_BORDER_COLOR_12             0x45f0
+#define R300_TX_BORDER_COLOR_13             0x45f4
+#define R300_TX_BORDER_COLOR_14             0x45f8
+#define R300_TX_BORDER_COLOR_15             0x45fc
+
+
+/* END: Texture specification */
+
+/* BEGIN: Fragment program instruction set */
+
+/* Fragment programs are written directly into register space.
+ * There are separate instruction streams for texture instructions and ALU
+ * instructions.
+ * In order to synchronize these streams, the program is divided into up
+ * to 4 nodes. Each node begins with a number of TEX operations, followed
+ * by a number of ALU operations.
+ * The first node can have zero TEX ops, all subsequent nodes must have at
+ * least
+ * one TEX ops.
+ * All nodes must have at least one ALU op.
+ *
+ * The index of the last node is stored in PFS_CNTL_0: A value of 0 means
+ * 1 node, a value of 3 means 4 nodes.
+ * The total amount of instructions is defined in PFS_CNTL_2. The offsets are
+ * offsets into the respective instruction streams, while *_END points to the
+ * last instruction relative to this offset.
+ */
+#define R300_US_CONFIG                      0x4600
+#       define R300_PFS_CNTL_LAST_NODES_SHIFT    0
+#       define R300_PFS_CNTL_LAST_NODES_MASK     (3 << 0)
+#       define R300_PFS_CNTL_FIRST_NODE_HAS_TEX  (1 << 3)
+#define R300_US_PIXSIZE                     0x4604
+/* There is an unshifted value here which has so far always been equal to the
+ * index of the highest used temporary register.
+ */
+#define R300_US_CODE_OFFSET                 0x4608
+#       define R300_PFS_CNTL_ALU_OFFSET_SHIFT    0
+#       define R300_PFS_CNTL_ALU_OFFSET_MASK     (63 << 0)
+#       define R300_PFS_CNTL_ALU_END_SHIFT       6
+#       define R300_PFS_CNTL_ALU_END_MASK        (63 << 6)
+#       define R300_PFS_CNTL_TEX_OFFSET_SHIFT    13
+#       define R300_PFS_CNTL_TEX_OFFSET_MASK     (31 << 13)
+#       define R300_PFS_CNTL_TEX_END_SHIFT       18
+#       define R300_PFS_CNTL_TEX_END_MASK        (31 << 18)
+
+/* gap */
+
+/* Nodes are stored backwards. The last active node is always stored in
+ * PFS_NODE_3.
+ * Example: In a 2-node program, NODE_0 and NODE_1 are set to 0. The
+ * first node is stored in NODE_2, the second node is stored in NODE_3.
+ *
+ * Offsets are relative to the master offset from PFS_CNTL_2.
+ */
+#define R300_US_CODE_ADDR_0                 0x4610
+#define R300_US_CODE_ADDR_1                 0x4614
+#define R300_US_CODE_ADDR_2                 0x4618
+#define R300_US_CODE_ADDR_3                 0x461C
+#       define R300_ALU_START_SHIFT         0
+#       define R300_ALU_START_MASK          (63 << 0)
+#       define R300_ALU_SIZE_SHIFT          6
+#       define R300_ALU_SIZE_MASK           (63 << 6)
+#       define R300_TEX_START_SHIFT         12
+#       define R300_TEX_START_MASK          (31 << 12)
+#       define R300_TEX_SIZE_SHIFT          17
+#       define R300_TEX_SIZE_MASK           (31 << 17)
+#      define R300_RGBA_OUT                (1 << 22)
+#      define R300_W_OUT                   (1 << 23)
+
+/* TEX
+ * As far as I can tell, texture instructions cannot write into output
+ * registers directly. A subsequent ALU instruction is always necessary,
+ * even if it's just MAD o0, r0, 1, 0
+ */
+#define R300_US_TEX_INST_0                  0x4620
+#      define R300_SRC_ADDR_SHIFT          0
+#      define R300_SRC_ADDR_MASK           (31 << 0)
+#      define R300_DST_ADDR_SHIFT          6
+#      define R300_DST_ADDR_MASK           (31 << 6)
+#      define R300_TEX_ID_SHIFT            11
+#       define R300_TEX_ID_MASK             (15 << 11)
+#      define R300_TEX_INST_SHIFT              15
+#              define R300_TEX_OP_NOP          0
+#              define R300_TEX_OP_LD           1
+#              define R300_TEX_OP_KIL          2
+#              define R300_TEX_OP_TXP          3
+#              define R300_TEX_OP_TXB          4
+#      define R300_TEX_INST_MASK               (7 << 15)
+
+/* Output format from the unfied shader */
+#define R300_US_OUT_FMT                     0x46A4
+#      define R300_US_OUT_FMT_C4_8         (0 << 0)
+#      define R300_US_OUT_FMT_C4_10        (1 << 0)
+#      define R300_US_OUT_FMT_C4_10_GAMMA  (2 << 0)
+#      define R300_US_OUT_FMT_C_16         (3 << 0)
+#      define R300_US_OUT_FMT_C2_16        (4 << 0)
+#      define R300_US_OUT_FMT_C4_16        (5 << 0)
+#      define R300_US_OUT_FMT_C_16_MPEG    (6 << 0)
+#      define R300_US_OUT_FMT_C2_16_MPEG   (7 << 0)
+#      define R300_US_OUT_FMT_C2_4         (8 << 0)
+#      define R300_US_OUT_FMT_C_3_3_2      (9 << 0)
+#      define R300_US_OUT_FMT_C_6_5_6      (10 << 0)
+#      define R300_US_OUT_FMT_C_11_11_10   (11 << 0)
+#      define R300_US_OUT_FMT_C_10_11_11   (12 << 0)
+#      define R300_US_OUT_FMT_C_2_10_10_10 (13 << 0)
+/* reserved */
+#      define R300_US_OUT_FMT_UNUSED       (15 << 0)
+#      define R300_US_OUT_FMT_C_16_FP      (16 << 0)
+#      define R300_US_OUT_FMT_C2_16_FP     (17 << 0)
+#      define R300_US_OUT_FMT_C4_16_FP     (18 << 0)
+#      define R300_US_OUT_FMT_C_32_FP      (19 << 0)
+#      define R300_US_OUT_FMT_C2_32_FP     (20 << 0)
+#      define R300_US_OUT_FMT_C4_32_FP     (20 << 0)
+
+/* ALU
+ * The ALU instructions register blocks are enumerated according to the order
+ * in which fglrx. I assume there is space for 64 instructions, since
+ * each block has space for a maximum of 64 DWORDs, and this matches reported
+ * native limits.
+ *
+ * The basic functional block seems to be one MAD for each color and alpha,
+ * and an adder that adds all components after the MUL.
+ *  - ADD, MUL, MAD etc.: use MAD with appropriate neutral operands
+ *  - DP4: Use OUTC_DP4, OUTA_DP4
+ *  - DP3: Use OUTC_DP3, OUTA_DP4, appropriate alpha operands
+ *  - DPH: Use OUTC_DP4, OUTA_DP4, appropriate alpha operands
+ *  - CMPH: If ARG2 > 0.5, return ARG0, else return ARG1
+ *  - CMP: If ARG2 < 0, return ARG1, else return ARG0
+ *  - FLR: use FRC+MAD
+ *  - XPD: use MAD+MAD
+ *  - SGE, SLT: use MAD+CMP
+ *  - RSQ: use ABS modifier for argument
+ *  - Use OUTC_REPL_ALPHA to write results of an alpha-only operation
+ *    (e.g. RCP) into color register
+ *  - apparently, there's no quick DST operation
+ *  - fglrx set FPI2_UNKNOWN_31 on a "MAD fragment.color, tmp0, tmp1, tmp2"
+ *  - fglrx set FPI2_UNKNOWN_31 on a "MAX r2, r1, c0"
+ *  - fglrx once set FPI0_UNKNOWN_31 on a "FRC r1, r1"
+ *
+ * Operand selection
+ * First stage selects three sources from the available registers and
+ * constant parameters. This is defined in INSTR1 (color) and INSTR3 (alpha).
+ * fglrx sorts the three source fields: Registers before constants,
+ * lower indices before higher indices; I do not know whether this is
+ * necessary.
+ *
+ * fglrx fills unused sources with "read constant 0"
+ * According to specs, you cannot select more than two different constants.
+ *
+ * Second stage selects the operands from the sources. This is defined in
+ * INSTR0 (color) and INSTR2 (alpha). You can also select the special constants
+ * zero and one.
+ * Swizzling and negation happens in this stage, as well.
+ *
+ * Important: Color and alpha seem to be mostly separate, i.e. their sources
+ * selection appears to be fully independent (the register storage is probably
+ * physically split into a color and an alpha section).
+ * However (because of the apparent physical split), there is some interaction
+ * WRT swizzling. If, for example, you want to load an R component into an
+ * Alpha operand, this R component is taken from a *color* source, not from
+ * an alpha source. The corresponding register doesn't even have to appear in
+ * the alpha sources list. (I hope this all makes sense to you)
+ *
+ * Destination selection
+ * The destination register index is in FPI1 (color) and FPI3 (alpha)
+ * together with enable bits.
+ * There are separate enable bits for writing into temporary registers
+ * (DSTC_REG_* /DSTA_REG) and and program output registers (DSTC_OUTPUT_*
+ * /DSTA_OUTPUT). You can write to both at once, or not write at all (the
+ * same index must be used for both).
+ *
+ * Note: There is a special form for LRP
+ *  - Argument order is the same as in ARB_fragment_program.
+ *  - Operation is MAD
+ *  - ARG1 is set to ARGC_SRC1C_LRP/ARGC_SRC1A_LRP
+ *  - Set FPI0/FPI2_SPECIAL_LRP
+ * Arbitrary LRP (including support for swizzling) requires vanilla MAD+MAD
+ */
+#define R300_US_ALU_RGB_ADDR_0                   0x46C0
+#       define R300_ALU_SRC0C_SHIFT             0
+#       define R300_ALU_SRC0C_MASK              (31 << 0)
+#       define R300_ALU_SRC0C_CONST             (1 << 5)
+#       define R300_ALU_SRC1C_SHIFT             6
+#       define R300_ALU_SRC1C_MASK              (31 << 6)
+#       define R300_ALU_SRC1C_CONST             (1 << 11)
+#       define R300_ALU_SRC2C_SHIFT             12
+#       define R300_ALU_SRC2C_MASK              (31 << 12)
+#       define R300_ALU_SRC2C_CONST             (1 << 17)
+#       define R300_ALU_SRC_MASK                0x0003ffff
+#       define R300_ALU_DSTC_SHIFT              18
+#       define R300_ALU_DSTC_MASK               (31 << 18)
+#              define R300_ALU_DSTC_REG_MASK_SHIFT     23
+#       define R300_ALU_DSTC_REG_X              (1 << 23)
+#       define R300_ALU_DSTC_REG_Y              (1 << 24)
+#       define R300_ALU_DSTC_REG_Z              (1 << 25)
+#              define R300_ALU_DSTC_OUTPUT_MASK_SHIFT  26
+#       define R300_ALU_DSTC_OUTPUT_X           (1 << 26)
+#       define R300_ALU_DSTC_OUTPUT_Y           (1 << 27)
+#       define R300_ALU_DSTC_OUTPUT_Z           (1 << 28)
+
+#define R300_US_ALU_ALPHA_ADDR_0                 0x47C0
+#       define R300_ALU_SRC0A_SHIFT             0
+#       define R300_ALU_SRC0A_MASK              (31 << 0)
+#       define R300_ALU_SRC0A_CONST             (1 << 5)
+#       define R300_ALU_SRC1A_SHIFT             6
+#       define R300_ALU_SRC1A_MASK              (31 << 6)
+#       define R300_ALU_SRC1A_CONST             (1 << 11)
+#       define R300_ALU_SRC2A_SHIFT             12
+#       define R300_ALU_SRC2A_MASK              (31 << 12)
+#       define R300_ALU_SRC2A_CONST             (1 << 17)
+#       define R300_ALU_SRC_MASK                0x0003ffff
+#       define R300_ALU_DSTA_SHIFT              18
+#       define R300_ALU_DSTA_MASK               (31 << 18)
+#       define R300_ALU_DSTA_REG                (1 << 23)
+#       define R300_ALU_DSTA_OUTPUT             (1 << 24)
+#              define R300_ALU_DSTA_DEPTH              (1 << 27)
+
+#define R300_US_ALU_RGB_INST_0                   0x48C0
+#       define R300_ALU_ARGC_SRC0C_XYZ          0
+#       define R300_ALU_ARGC_SRC0C_XXX          1
+#       define R300_ALU_ARGC_SRC0C_YYY          2
+#       define R300_ALU_ARGC_SRC0C_ZZZ          3
+#       define R300_ALU_ARGC_SRC1C_XYZ          4
+#       define R300_ALU_ARGC_SRC1C_XXX          5
+#       define R300_ALU_ARGC_SRC1C_YYY          6
+#       define R300_ALU_ARGC_SRC1C_ZZZ          7
+#       define R300_ALU_ARGC_SRC2C_XYZ          8
+#       define R300_ALU_ARGC_SRC2C_XXX          9
+#       define R300_ALU_ARGC_SRC2C_YYY          10
+#       define R300_ALU_ARGC_SRC2C_ZZZ          11
+#       define R300_ALU_ARGC_SRC0A              12
+#       define R300_ALU_ARGC_SRC1A              13
+#       define R300_ALU_ARGC_SRC2A              14
+#       define R300_ALU_ARGC_SRCP_XYZ           15
+#       define R300_ALU_ARGC_SRCP_XXX           16
+#       define R300_ALU_ARGC_SRCP_YYY           17
+#       define R300_ALU_ARGC_SRCP_ZZZ           18
+#       define R300_ALU_ARGC_SRCP_WWW           19
+#       define R300_ALU_ARGC_ZERO               20
+#       define R300_ALU_ARGC_ONE                21
+#       define R300_ALU_ARGC_HALF               22
+#       define R300_ALU_ARGC_SRC0C_YZX          23
+#       define R300_ALU_ARGC_SRC1C_YZX          24
+#       define R300_ALU_ARGC_SRC2C_YZX          25
+#       define R300_ALU_ARGC_SRC0C_ZXY          26
+#       define R300_ALU_ARGC_SRC1C_ZXY          27
+#       define R300_ALU_ARGC_SRC2C_ZXY          28
+#       define R300_ALU_ARGC_SRC0CA_WZY         29
+#       define R300_ALU_ARGC_SRC1CA_WZY         30
+#       define R300_ALU_ARGC_SRC2CA_WZY         31
+
+#       define R300_ALU_ARG0C_SHIFT             0
+#       define R300_ALU_ARG0C_MASK              (31 << 0)
+#       define R300_ALU_ARG0C_NOP               (0 << 5)
+#       define R300_ALU_ARG0C_NEG               (1 << 5)
+#       define R300_ALU_ARG0C_ABS               (2 << 5)
+#       define R300_ALU_ARG0C_NAB               (3 << 5)
+#       define R300_ALU_ARG1C_SHIFT             7
+#       define R300_ALU_ARG1C_MASK              (31 << 7)
+#       define R300_ALU_ARG1C_NOP               (0 << 12)
+#       define R300_ALU_ARG1C_NEG               (1 << 12)
+#       define R300_ALU_ARG1C_ABS               (2 << 12)
+#       define R300_ALU_ARG1C_NAB               (3 << 12)
+#       define R300_ALU_ARG2C_SHIFT             14
+#       define R300_ALU_ARG2C_MASK              (31 << 14)
+#       define R300_ALU_ARG2C_NOP               (0 << 19)
+#       define R300_ALU_ARG2C_NEG               (1 << 19)
+#       define R300_ALU_ARG2C_ABS               (2 << 19)
+#       define R300_ALU_ARG2C_NAB               (3 << 19)
+#       define R300_ALU_SRCP_1_MINUS_2_SRC0     (0 << 21)
+#       define R300_ALU_SRCP_SRC1_MINUS_SRC0    (1 << 21)
+#       define R300_ALU_SRCP_SRC1_PLUS_SRC0     (2 << 21)
+#       define R300_ALU_SRCP_1_MINUS_SRC0       (3 << 21)
+
+#       define R300_ALU_OUTC_MAD                (0 << 23)
+#       define R300_ALU_OUTC_DP3                (1 << 23)
+#       define R300_ALU_OUTC_DP4                (2 << 23)
+#       define R300_ALU_OUTC_D2A                (3 << 23)
+#       define R300_ALU_OUTC_MIN                (4 << 23)
+#       define R300_ALU_OUTC_MAX                (5 << 23)
+#       define R300_ALU_OUTC_CMPH               (7 << 23)
+#       define R300_ALU_OUTC_CMP                (8 << 23)
+#       define R300_ALU_OUTC_FRC                (9 << 23)
+#       define R300_ALU_OUTC_REPL_ALPHA         (10 << 23)
+
+#       define R300_ALU_OUTC_MOD_NOP            (0 << 27)
+#       define R300_ALU_OUTC_MOD_MUL2           (1 << 27)
+#       define R300_ALU_OUTC_MOD_MUL4           (2 << 27)
+#       define R300_ALU_OUTC_MOD_MUL8           (3 << 27)
+#       define R300_ALU_OUTC_MOD_DIV2           (4 << 27)
+#       define R300_ALU_OUTC_MOD_DIV4           (5 << 27)
+#       define R300_ALU_OUTC_MOD_DIV8           (6 << 27)
+
+#       define R300_ALU_OUTC_CLAMP              (1 << 30)
+#       define R300_ALU_INSERT_NOP              (1 << 31)
+
+#define R300_US_ALU_ALPHA_INST_0                 0x49C0
+#       define R300_ALU_ARGA_SRC0C_X            0
+#       define R300_ALU_ARGA_SRC0C_Y            1
+#       define R300_ALU_ARGA_SRC0C_Z            2
+#       define R300_ALU_ARGA_SRC1C_X            3
+#       define R300_ALU_ARGA_SRC1C_Y            4
+#       define R300_ALU_ARGA_SRC1C_Z            5
+#       define R300_ALU_ARGA_SRC2C_X            6
+#       define R300_ALU_ARGA_SRC2C_Y            7
+#       define R300_ALU_ARGA_SRC2C_Z            8
+#       define R300_ALU_ARGA_SRC0A              9
+#       define R300_ALU_ARGA_SRC1A              10
+#       define R300_ALU_ARGA_SRC2A              11
+#       define R300_ALU_ARGA_SRCP_X             12
+#       define R300_ALU_ARGA_SRCP_Y             13
+#       define R300_ALU_ARGA_SRCP_Z             14
+#       define R300_ALU_ARGA_SRCP_W             15
+
+#       define R300_ALU_ARGA_ZERO               16
+#       define R300_ALU_ARGA_ONE                17
+#       define R300_ALU_ARGA_HALF               18
+#       define R300_ALU_ARG0A_SHIFT             0
+#       define R300_ALU_ARG0A_MASK              (31 << 0)
+#       define R300_ALU_ARG0A_NOP               (0 << 5)
+#       define R300_ALU_ARG0A_NEG               (1 << 5)
+#      define R300_ALU_ARG0A_ABS                (2 << 5)
+#      define R300_ALU_ARG0A_NAB                (3 << 5)
+#       define R300_ALU_ARG1A_SHIFT             7
+#       define R300_ALU_ARG1A_MASK              (31 << 7)
+#       define R300_ALU_ARG1A_NOP               (0 << 12)
+#       define R300_ALU_ARG1A_NEG               (1 << 12)
+#      define R300_ALU_ARG1A_ABS                (2 << 12)
+#      define R300_ALU_ARG1A_NAB                (3 << 12)
+#       define R300_ALU_ARG2A_SHIFT             14
+#       define R300_ALU_ARG2A_MASK              (31 << 14)
+#       define R300_ALU_ARG2A_NOP               (0 << 19)
+#       define R300_ALU_ARG2A_NEG               (1 << 19)
+#      define R300_ALU_ARG2A_ABS                (2 << 19)
+#      define R300_ALU_ARG2A_NAB                (3 << 19)
+#       define R300_ALU_SRCP_1_MINUS_2_SRC0     (0 << 21)
+#       define R300_ALU_SRCP_SRC1_MINUS_SRC0    (1 << 21)
+#       define R300_ALU_SRCP_SRC1_PLUS_SRC0     (2 << 21)
+#       define R300_ALU_SRCP_1_MINUS_SRC0       (3 << 21)
+
+#       define R300_ALU_OUTA_MAD                (0 << 23)
+#       define R300_ALU_OUTA_DP4                (1 << 23)
+#       define R300_ALU_OUTA_MIN                (2 << 23)
+#       define R300_ALU_OUTA_MAX                (3 << 23)
+#       define R300_ALU_OUTA_CND                (5 << 23)
+#       define R300_ALU_OUTA_CMP                (6 << 23)
+#       define R300_ALU_OUTA_FRC                (7 << 23)
+#       define R300_ALU_OUTA_EX2                (8 << 23)
+#       define R300_ALU_OUTA_LG2                (9 << 23)
+#       define R300_ALU_OUTA_RCP                (10 << 23)
+#       define R300_ALU_OUTA_RSQ                (11 << 23)
+
+#       define R300_ALU_OUTA_MOD_NOP            (0 << 27)
+#       define R300_ALU_OUTA_MOD_MUL2           (1 << 27)
+#       define R300_ALU_OUTA_MOD_MUL4           (2 << 27)
+#       define R300_ALU_OUTA_MOD_MUL8           (3 << 27)
+#       define R300_ALU_OUTA_MOD_DIV2           (4 << 27)
+#       define R300_ALU_OUTA_MOD_DIV4           (5 << 27)
+#       define R300_ALU_OUTA_MOD_DIV8           (6 << 27)
+
+#       define R300_ALU_OUTA_CLAMP              (1 << 30)
+/* END: Fragment program instruction set */
+
+/* Fog: Fog Blending Enable */
+#define R300_FG_FOG_BLEND                             0x4bc0
+#       define R300_FG_FOG_BLEND_DISABLE              (0 << 0)
+#       define R300_FG_FOG_BLEND_ENABLE               (1 << 0)
+#      define R300_FG_FOG_BLEND_FN_LINEAR            (0 << 1)
+#      define R300_FG_FOG_BLEND_FN_EXP               (1 << 1)
+#      define R300_FG_FOG_BLEND_FN_EXP2              (2 << 1)
+#      define R300_FG_FOG_BLEND_FN_CONSTANT          (3 << 1)
+#      define R300_FG_FOG_BLEND_FN_MASK              (3 << 1)
+
+/* Fog: Red Component of Fog Color */
+#define R300_FG_FOG_COLOR_R                           0x4bc8
+/* Fog: Green Component of Fog Color */
+#define R300_FG_FOG_COLOR_G                           0x4bcc
+/* Fog: Blue Component of Fog Color */
+#define R300_FG_FOG_COLOR_B                           0x4bd0
+#      define R300_FG_FOG_COLOR_MASK 0x000003ff
+
+/* Fog: Constant Factor for Fog Blending */
+#define R300_FG_FOG_FACTOR                            0x4bc4
+#      define FG_FOG_FACTOR_MASK 0x000003ff
+
+/* Fog: Alpha function */
+#define R300_FG_ALPHA_FUNC                            0x4bd4
+#       define R300_FG_ALPHA_FUNC_VAL_MASK               0x000000ff
+#       define R300_FG_ALPHA_FUNC_NEVER                     (0 << 8)
+#       define R300_FG_ALPHA_FUNC_LESS                      (1 << 8)
+#       define R300_FG_ALPHA_FUNC_EQUAL                     (2 << 8)
+#       define R300_FG_ALPHA_FUNC_LE                        (3 << 8)
+#       define R300_FG_ALPHA_FUNC_GREATER                   (4 << 8)
+#       define R300_FG_ALPHA_FUNC_NOTEQUAL                  (5 << 8)
+#       define R300_FG_ALPHA_FUNC_GE                        (6 << 8)
+#       define R300_FG_ALPHA_FUNC_ALWAYS                    (7 << 8)
+#       define R300_ALPHA_TEST_OP_MASK                      (7 << 8)
+#       define R300_FG_ALPHA_FUNC_DISABLE                   (0 << 11)
+#       define R300_FG_ALPHA_FUNC_ENABLE                    (1 << 11)
+
+#       define R500_FG_ALPHA_FUNC_10BIT                     (0 << 12)
+#       define R500_FG_ALPHA_FUNC_8BIT                      (1 << 12)
+
+#       define R300_FG_ALPHA_FUNC_MASK_DISABLE              (0 << 16)
+#       define R300_FG_ALPHA_FUNC_MASK_ENABLE               (1 << 16)
+#       define R300_FG_ALPHA_FUNC_CFG_2_OF_4                (0 << 17)
+#       define R300_FG_ALPHA_FUNC_CFG_3_OF_6                (1 << 17)
+
+#       define R300_FG_ALPHA_FUNC_DITH_DISABLE              (0 << 20)
+#       define R300_FG_ALPHA_FUNC_DITH_ENABLE               (1 << 20)
+
+#       define R500_FG_ALPHA_FUNC_OFFSET_DISABLE            (0 << 24)
+#       define R500_FG_ALPHA_FUNC_OFFSET_ENABLE             (1 << 24) /* Not supported in R520 */
+#       define R500_FG_ALPHA_FUNC_DISC_ZERO_MASK_DISABLE    (0 << 25)
+#       define R500_FG_ALPHA_FUNC_DISC_ZERO_MASK_ENABLE     (1 << 25)
+
+#       define R500_FG_ALPHA_FUNC_FP16_DISABLE              (0 << 28)
+#       define R500_FG_ALPHA_FUNC_FP16_ENABLE               (1 << 28)
+
+
+/* Fog: Where does the depth come from? */
+#define R300_FG_DEPTH_SRC                  0x4bd8
+#      define R300_FG_DEPTH_SRC_SCAN   (0 << 0)
+#      define R300_FG_DEPTH_SRC_SHADER (1 << 0)
+
+/* Fog: Alpha Compare Value */
+#define R500_FG_ALPHA_VALUE                0x4be0
+#      define R500_FG_ALPHA_VALUE_MASK 0x0000ffff
+
+/* gap */
+
+/* Fragment program parameters in 7.16 floating point */
+#define R300_PFS_PARAM_0_X                  0x4C00
+#define R300_PFS_PARAM_0_Y                  0x4C04
+#define R300_PFS_PARAM_0_Z                  0x4C08
+#define R300_PFS_PARAM_0_W                  0x4C0C
+/* last consts */
+#define R300_PFS_PARAM_31_X                 0x4DF0
+#define R300_PFS_PARAM_31_Y                 0x4DF4
+#define R300_PFS_PARAM_31_Z                 0x4DF8
+#define R300_PFS_PARAM_31_W                 0x4DFC
+
+/* Unpipelined. */
+#define R300_RB3D_CCTL                      0x4e00
+#      define R300_RB3D_CCTL_NUM_MULTIWRITES_1_BUFFER                (0 << 5)
+#      define R300_RB3D_CCTL_NUM_MULTIWRITES_2_BUFFERS               (1 << 5)
+#      define R300_RB3D_CCTL_NUM_MULTIWRITES_3_BUFFERS               (2 << 5)
+#      define R300_RB3D_CCTL_NUM_MULTIWRITES_4_BUFFERS               (3 << 5)
+#      define R300_RB3D_CCTL_CLRCMP_FLIPE_DISABLE                    (0 << 7)
+#      define R300_RB3D_CCTL_CLRCMP_FLIPE_ENABLE                     (1 << 7)
+#      define R300_RB3D_CCTL_AA_COMPRESSION_DISABLE                  (0 << 9)
+#      define R300_RB3D_CCTL_AA_COMPRESSION_ENABLE                   (1 << 9)
+#      define R300_RB3D_CCTL_CMASK_DISABLE                           (0 << 10)
+#      define R300_RB3D_CCTL_CMASK_ENABLE                            (1 << 10)
+/* reserved */
+#      define R300_RB3D_CCTL_INDEPENDENT_COLOR_CHANNEL_MASK_DISABLE  (0 << 12)
+#      define R300_RB3D_CCTL_INDEPENDENT_COLOR_CHANNEL_MASK_ENABLE   (1 << 12)
+#      define R300_RB3D_CCTL_WRITE_COMPRESSION_ENABLE                (0 << 13)
+#      define R300_RB3D_CCTL_WRITE_COMPRESSION_DISABLE               (1 << 13)
+#      define R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_DISABLE  (0 << 14)
+#      define R300_RB3D_CCTL_INDEPENDENT_COLORFORMAT_ENABLE_ENABLE   (1 << 14)
+
+
+/* Notes:
+ * - AFAIK fglrx always sets BLEND_UNKNOWN when blending is used in
+ *   the application
+ * - AFAIK fglrx always sets BLEND_NO_SEPARATE when CBLEND and ABLEND
+ *    are set to the same
+ *   function (both registers are always set up completely in any case)
+ * - Most blend flags are simply copied from R200 and not tested yet
+ */
+#define R300_RB3D_CBLEND                    0x4E04
+#define R300_RB3D_ABLEND                    0x4E08
+/* the following only appear in CBLEND */
+#       define R300_ALPHA_BLEND_ENABLE         (1 << 0)
+#       define R300_SEPARATE_ALPHA_ENABLE      (1 << 1)
+#       define R300_READ_ENABLE                (1 << 2)
+#       define R300_DISCARD_SRC_PIXELS_DIS     (0 << 3)
+#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0     (1 << 3)
+#       define R300_DISCARD_SRC_PIXELS_SRC_COLOR_0     (2 << 3)
+#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_0     (3 << 3)
+#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_1     (4 << 3)
+#       define R300_DISCARD_SRC_PIXELS_SRC_COLOR_1     (5 << 3)
+#       define R300_DISCARD_SRC_PIXELS_SRC_ALPHA_COLOR_1     (6 << 3)
+
+/* the following are shared between CBLEND and ABLEND */
+#       define R300_FCN_MASK                         (3  << 12)
+#       define R300_COMB_FCN_ADD_CLAMP               (0  << 12)
+#       define R300_COMB_FCN_ADD_NOCLAMP             (1  << 12)
+#       define R300_COMB_FCN_SUB_CLAMP               (2  << 12)
+#       define R300_COMB_FCN_SUB_NOCLAMP             (3  << 12)
+#       define R300_COMB_FCN_MIN                     (4  << 12)
+#       define R300_COMB_FCN_MAX                     (5  << 12)
+#       define R300_COMB_FCN_RSUB_CLAMP              (6  << 12)
+#       define R300_COMB_FCN_RSUB_NOCLAMP            (7  << 12)
+#       define R300_BLEND_GL_ZERO                    (32)
+#       define R300_BLEND_GL_ONE                     (33)
+#       define R300_BLEND_GL_SRC_COLOR               (34)
+#       define R300_BLEND_GL_ONE_MINUS_SRC_COLOR     (35)
+#       define R300_BLEND_GL_DST_COLOR               (36)
+#       define R300_BLEND_GL_ONE_MINUS_DST_COLOR     (37)
+#       define R300_BLEND_GL_SRC_ALPHA               (38)
+#       define R300_BLEND_GL_ONE_MINUS_SRC_ALPHA     (39)
+#       define R300_BLEND_GL_DST_ALPHA               (40)
+#       define R300_BLEND_GL_ONE_MINUS_DST_ALPHA     (41)
+#       define R300_BLEND_GL_SRC_ALPHA_SATURATE      (42)
+#       define R300_BLEND_GL_CONST_COLOR             (43)
+#       define R300_BLEND_GL_ONE_MINUS_CONST_COLOR   (44)
+#       define R300_BLEND_GL_CONST_ALPHA             (45)
+#       define R300_BLEND_GL_ONE_MINUS_CONST_ALPHA   (46)
+#       define R300_BLEND_MASK                       (63)
+#       define R300_SRC_BLEND_SHIFT                  (16)
+#       define R300_DST_BLEND_SHIFT                  (24)
+
+/* Constant color used by the blender. Pipelined through the blender.
+ * Note: For R520, this field is ignored, use RB3D_CONSTANT_COLOR_GB__BLUE,
+ * RB3D_CONSTANT_COLOR_GB__GREEN, etc. instead.
+ */
+#define R300_RB3D_BLEND_COLOR               0x4E10
+
+
+/* 3D Color Channel Mask. If all the channels used in the current color format
+ * are disabled, then the cb will discard all the incoming quads. Pipelined
+ * through the blender.
+ */
+#define RB3D_COLOR_CHANNEL_MASK                  0x4E0C
+#      define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0  (1 << 0)
+#      define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 (1 << 1)
+#      define RB3D_COLOR_CHANNEL_MASK_RED_MASK0   (1 << 2)
+#      define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 (1 << 3)
+#      define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK1  (1 << 4)
+#      define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK1 (1 << 5)
+#      define RB3D_COLOR_CHANNEL_MASK_RED_MASK1   (1 << 6)
+#      define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK1 (1 << 7)
+#      define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK2  (1 << 8)
+#      define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK2 (1 << 9)
+#      define RB3D_COLOR_CHANNEL_MASK_RED_MASK2   (1 << 10)
+#      define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK2 (1 << 11)
+#      define RB3D_COLOR_CHANNEL_MASK_BLUE_MASK3  (1 << 12)
+#      define RB3D_COLOR_CHANNEL_MASK_GREEN_MASK3 (1 << 13)
+#      define RB3D_COLOR_CHANNEL_MASK_RED_MASK3   (1 << 14)
+#      define RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK3 (1 << 15)
+
+/* Clear color that is used when the color mask is set to 00. Unpipelined.
+ * Program this register with a 32-bit value in ARGB8888 or ARGB2101010
+ * formats, ignoring the fields.
+ */
+#define RB3D_COLOR_CLEAR_VALUE                   0x4e14
+
+/* gap */
+
+/* Color Compare Color. Stalls the 2d/3d datapath until it is idle. */
+#define RB3D_CLRCMP_CLR                     0x4e20
+
+/* Color Compare Mask. Stalls the 2d/3d datapath until it is idle. */
+#define RB3D_CLRCMP_MSK                     0x4e24
+
+/* Color Buffer Address Offset of multibuffer 0. Unpipelined. */
+#define R300_RB3D_COLOROFFSET0              0x4E28
+#       define R300_COLOROFFSET_MASK             0xFFFFFFE0
+/* Color Buffer Address Offset of multibuffer 1. Unpipelined. */
+#define R300_RB3D_COLOROFFSET1              0x4E2C
+/* Color Buffer Address Offset of multibuffer 2. Unpipelined. */
+#define R300_RB3D_COLOROFFSET2              0x4E30
+/* Color Buffer Address Offset of multibuffer 3. Unpipelined. */
+#define R300_RB3D_COLOROFFSET3              0x4E34
+
+/* Color buffer format and tiling control for all the multibuffers and the
+ * pitch of multibuffer 0 to 3. Unpipelined. The cache must be empty before any
+ * of the registers are changed.
+ *
+ * Bit 16: Larger tiles
+ * Bit 17: 4x2 tiles
+ * Bit 18: Extremely weird tile like, but some pixels duplicated?
+ */
+#define R300_RB3D_COLORPITCH0               0x4E38
+#       define R300_COLORPITCH_MASK              0x00003FFE
+#       define R300_COLOR_TILE_DISABLE            (0 << 16)
+#       define R300_COLOR_TILE_ENABLE             (1 << 16)
+#       define R300_COLOR_MICROTILE_DISABLE       (0 << 17)
+#       define R300_COLOR_MICROTILE_ENABLE        (1 << 17)
+#       define R300_COLOR_MICROTILE_ENABLE_SQUARE (2 << 17) /* Only available in 16-bit */
+#       define R300_COLOR_ENDIAN_NO_SWAP          (0 << 19)
+#       define R300_COLOR_ENDIAN_WORD_SWAP        (1 << 19)
+#       define R300_COLOR_ENDIAN_DWORD_SWAP       (2 << 19)
+#       define R300_COLOR_ENDIAN_HALF_DWORD_SWAP  (3 << 19)
+#      define R500_COLOR_FORMAT_ARGB10101010     (0 << 21)
+#      define R500_COLOR_FORMAT_UV1010           (1 << 21)
+#      define R500_COLOR_FORMAT_CI8              (2 << 21) /* 2D only */
+#      define R300_COLOR_FORMAT_ARGB1555         (3 << 21)
+#       define R300_COLOR_FORMAT_RGB565           (4 << 21)
+#       define R500_COLOR_FORMAT_ARGB2101010      (5 << 21)
+#       define R300_COLOR_FORMAT_ARGB8888         (6 << 21)
+#       define R300_COLOR_FORMAT_ARGB32323232     (7 << 21)
+/* reserved */
+#       define R300_COLOR_FORMAT_I8               (9 << 21)
+#       define R300_COLOR_FORMAT_ARGB16161616     (10 << 21)
+#       define R300_COLOR_FORMAT_VYUY             (11 << 21)
+#       define R300_COLOR_FORMAT_YVYU             (12 << 21)
+#       define R300_COLOR_FORMAT_UV88             (13 << 21)
+#       define R500_COLOR_FORMAT_I10              (14 << 21)
+#       define R300_COLOR_FORMAT_ARGB4444         (15 << 21)
+#define R300_RB3D_COLORPITCH1               0x4E3C
+#define R300_RB3D_COLORPITCH2               0x4E40
+#define R300_RB3D_COLORPITCH3               0x4E44
+
+/* gap */
+
+/* Destination Color Buffer Cache Control/Status. If the cb is in e2 mode, then
+ * a flush or free will not occur upon a write to this register, but a sync
+ * will be immediately sent if one is requested. If both DC_FLUSH and DC_FREE
+ * are zero but DC_FINISH is one, then a sync will be sent immediately -- the
+ * cb will not wait for all the previous operations to complete before sending
+ * the sync. Unpipelined except when DC_FINISH and DC_FREE are both set to
+ * zero.
+ *
+ * Set to 0A before 3D operations, set to 02 afterwards.
+ */
+#define R300_RB3D_DSTCACHE_CTLSTAT               0x4e4c
+#      define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_NO_EFFECT         (0 << 0)
+#      define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_NO_EFFECT_1       (1 << 0)
+#      define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D    (2 << 0)
+#      define R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D_1  (3 << 0)
+#      define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_NO_EFFECT          (0 << 2)
+#      define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_NO_EFFECT_1        (1 << 2)
+#      define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS       (2 << 2)
+#      define R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS_1     (3 << 2)
+#      define R300_RB3D_DSTCACHE_CTLSTAT_DC_FINISH_NO_SIGNAL        (0 << 4)
+#      define R300_RB3D_DSTCACHE_CTLSTAT_DC_FINISH_SIGNAL           (1 << 4)
+
+#define R300_RB3D_DITHER_CTL 0x4E50
+#      define R300_RB3D_DITHER_CTL_DITHER_MODE_TRUNCATE         (0 << 0)
+#      define R300_RB3D_DITHER_CTL_DITHER_MODE_ROUND            (1 << 0)
+#      define R300_RB3D_DITHER_CTL_DITHER_MODE_LUT              (2 << 0)
+/* reserved */
+#      define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_TRUNCATE   (0 << 2)
+#      define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_ROUND      (1 << 2)
+#      define R300_RB3D_DITHER_CTL_ALPHA_DITHER_MODE_LUT        (2 << 2)
+/* reserved */
+
+/* Resolve buffer destination address. The cache must be empty before changing
+ * this register if the cb is in resolve mode. Unpipelined
+ */
+#define R300_RB3D_AARESOLVE_OFFSET        0x4e80
+#      define R300_RB3D_AARESOLVE_OFFSET_SHIFT 5
+#      define R300_RB3D_AARESOLVE_OFFSET_MASK 0xffffffe0 /* At least according to the calculations of Christoph Brill */
+
+/* Resolve Buffer Pitch and Tiling Control. The cache must be empty before
+ * changing this register if the cb is in resolve mode. Unpipelined
+ */
+#define R300_RB3D_AARESOLVE_PITCH         0x4e84
+#      define R300_RB3D_AARESOLVE_PITCH_SHIFT 1
+#      define R300_RB3D_AARESOLVE_PITCH_MASK  0x00003ffe /* At least according to the calculations of Christoph Brill */
+
+/* Resolve Buffer Control. Unpipelined */
+#define R300_RB3D_AARESOLVE_CTL           0x4e88
+#      define R300_RB3D_AARESOLVE_CTL_AARESOLVE_MODE_NORMAL   (0 << 0)
+#      define R300_RB3D_AARESOLVE_CTL_AARESOLVE_MODE_RESOLVE  (1 << 0)
+#      define R300_RB3D_AARESOLVE_CTL_AARESOLVE_GAMMA_10      (0 << 1)
+#      define R300_RB3D_AARESOLVE_CTL_AARESOLVE_GAMMA_22      (1 << 1)
+#      define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_SAMPLE0 (0 << 2)
+#      define R300_RB3D_AARESOLVE_CTL_AARESOLVE_ALPHA_AVERAGE (1 << 2)
+
+
+/* Discard src pixels less than or equal to threshold. */
+#define R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD 0x4ea0
+/* Discard src pixels greater than or equal to threshold. */
+#define R500_RB3D_DISCARD_SRC_PIXEL_GTE_THRESHOLD 0x4ea4
+#      define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_BLUE_SHIFT 0
+#      define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_BLUE_MASK 0x000000ff
+#      define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_GREEN_SHIFT 8
+#      define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_GREEN_MASK 0x0000ff00
+#      define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_RED_SHIFT 16
+#      define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_RED_MASK 0x00ff0000
+#      define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_ALPHA_SHIFT 24
+#      define R500_RB3D_DISCARD_SRC_PIXEL_THRESHOLD_ALPHA_MASK 0xff000000
+
+/* 3D ROP Control. Stalls the 2d/3d datapath until it is idle. */
+#define R300_RB3D_ROPCNTL                             0x4e18
+#      define R300_RB3D_ROPCNTL_ROP_ENABLE            0x00000004
+#      define R300_RB3D_ROPCNTL_ROP_MASK              (15 << 8)
+#      define R300_RB3D_ROPCNTL_ROP_SHIFT             8
+
+/* Color Compare Flip. Stalls the 2d/3d datapath until it is idle. */
+#define R300_RB3D_CLRCMP_FLIPE                        0x4e1c
+
+/* Sets the fifo sizes */
+#define R500_RB3D_FIFO_SIZE                           0x4ef4
+#      define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_FULL   (0 << 0)
+#      define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_HALF   (1 << 0)
+#      define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_QUATER (2 << 0)
+#      define R500_RB3D_FIFO_SIZE_OP_FIFO_SIZE_EIGTHS (3 << 0)
+
+/* Constant color used by the blender. Pipelined through the blender. */
+#define R500_RB3D_CONSTANT_COLOR_AR                   0x4ef8
+#      define R500_RB3D_CONSTANT_COLOR_AR_RED_MASK    0x0000ffff
+#      define R500_RB3D_CONSTANT_COLOR_AR_RED_SHIFT   0
+#      define R500_RB3D_CONSTANT_COLOR_AR_ALPHA_MASK  0xffff0000
+#      define R500_RB3D_CONSTANT_COLOR_AR_ALPHA_SHIFT 16
+
+/* Constant color used by the blender. Pipelined through the blender. */
+#define R500_RB3D_CONSTANT_COLOR_GB                   0x4efc
+#      define R500_RB3D_CONSTANT_COLOR_AR_BLUE_MASK   0x0000ffff
+#      define R500_RB3D_CONSTANT_COLOR_AR_BLUE_SHIFT  0
+#      define R500_RB3D_CONSTANT_COLOR_AR_GREEN_MASK  0xffff0000
+#      define R500_RB3D_CONSTANT_COLOR_AR_GREEN_SHIFT 16
+
+/* gap */
+/* There seems to be no "write only" setting, so use Z-test = ALWAYS
+ * for this.
+ * Bit (1<<8) is the "test" bit. so plain write is 6  - vd
+ */
+#define R300_ZB_CNTL                             0x4F00
+#      define R300_STENCIL_ENABLE               (1 << 0)
+#      define R300_Z_ENABLE                     (1 << 1)
+#      define R300_Z_WRITE_ENABLE               (1 << 2)
+#      define R300_Z_SIGNED_COMPARE             (1 << 3)
+#      define R300_STENCIL_FRONT_BACK           (1 << 4)
+
+#define R300_ZB_ZSTENCILCNTL                   0x4f04
+       /* functions */
+#      define R300_ZS_NEVER                    0
+#      define R300_ZS_LESS                     1
+#      define R300_ZS_LEQUAL                   2
+#      define R300_ZS_EQUAL                    3
+#      define R300_ZS_GEQUAL                   4
+#      define R300_ZS_GREATER                  5
+#      define R300_ZS_NOTEQUAL                 6
+#      define R300_ZS_ALWAYS                   7
+#       define R300_ZS_MASK                     7
+       /* operations */
+#      define R300_ZS_KEEP                     0
+#      define R300_ZS_ZERO                     1
+#      define R300_ZS_REPLACE                  2
+#      define R300_ZS_INCR                     3
+#      define R300_ZS_DECR                     4
+#      define R300_ZS_INVERT                   5
+#      define R300_ZS_INCR_WRAP                6
+#      define R300_ZS_DECR_WRAP                7
+#      define R300_Z_FUNC_SHIFT                0
+       /* front and back refer to operations done for front
+          and back faces, i.e. separate stencil function support */
+#      define R300_S_FRONT_FUNC_SHIFT          3
+#      define R300_S_FRONT_SFAIL_OP_SHIFT      6
+#      define R300_S_FRONT_ZPASS_OP_SHIFT      9
+#      define R300_S_FRONT_ZFAIL_OP_SHIFT      12
+#      define R300_S_BACK_FUNC_SHIFT           15
+#      define R300_S_BACK_SFAIL_OP_SHIFT       18
+#      define R300_S_BACK_ZPASS_OP_SHIFT       21
+#      define R300_S_BACK_ZFAIL_OP_SHIFT       24
+
+#define R300_ZB_STENCILREFMASK                        0x4f08
+#      define R300_STENCILREF_SHIFT       0
+#      define R300_STENCILREF_MASK        0x000000ff
+#      define R300_STENCILMASK_SHIFT      8
+#      define R300_STENCILMASK_MASK       0x0000ff00
+#      define R300_STENCILWRITEMASK_SHIFT 16
+#      define R300_STENCILWRITEMASK_MASK  0x00ff0000
+
+/* gap */
+
+#define R300_ZB_FORMAT                             0x4f10
+#      define R300_DEPTHFORMAT_16BIT_INT_Z   (0 << 0)
+#      define R300_DEPTHFORMAT_16BIT_13E3    (1 << 0)
+#      define R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL   (2 << 0)
+/* reserved up to (15 << 0) */
+#      define R300_INVERT_13E3_LEADING_ONES  (0 << 4)
+#      define R300_INVERT_13E3_LEADING_ZEROS (1 << 4)
+
+#define R300_ZB_ZTOP                             0x4F14
+#      define R300_ZTOP_DISABLE                 (0 << 0)
+#      define R300_ZTOP_ENABLE                  (1 << 0)
+
+/* gap */
+
+#define R300_ZB_ZCACHE_CTLSTAT            0x4f18
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_NO_EFFECT      (0 << 0)
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE (1 << 0)
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_NO_EFFECT       (0 << 1)
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE            (1 << 1)
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_BUSY_IDLE            (0 << 31)
+#       define R300_ZB_ZCACHE_CTLSTAT_ZC_BUSY_BUSY            (1 << 31)
+
+#define R300_ZB_BW_CNTL                     0x4f1c
+#      define R300_HIZ_DISABLE                              (0 << 0)
+#      define R300_HIZ_ENABLE                               (1 << 0)
+#      define R300_HIZ_MIN                                  (0 << 1)
+#      define R300_HIZ_MAX                                  (1 << 1)
+#      define R300_FAST_FILL_DISABLE                        (0 << 2)
+#      define R300_FAST_FILL_ENABLE                         (1 << 2)
+#      define R300_RD_COMP_DISABLE                          (0 << 3)
+#      define R300_RD_COMP_ENABLE                           (1 << 3)
+#      define R300_WR_COMP_DISABLE                          (0 << 4)
+#      define R300_WR_COMP_ENABLE                           (1 << 4)
+#      define R300_ZB_CB_CLEAR_RMW                          (0 << 5)
+#      define R300_ZB_CB_CLEAR_CACHE_LINEAR                 (1 << 5)
+#      define R300_FORCE_COMPRESSED_STENCIL_VALUE_DISABLE   (0 << 6)
+#      define R300_FORCE_COMPRESSED_STENCIL_VALUE_ENABLE    (1 << 6)
+
+#      define R500_ZEQUAL_OPTIMIZE_ENABLE                   (0 << 7)
+#      define R500_ZEQUAL_OPTIMIZE_DISABLE                  (1 << 7)
+#      define R500_SEQUAL_OPTIMIZE_ENABLE                   (0 << 8)
+#      define R500_SEQUAL_OPTIMIZE_DISABLE                  (1 << 8)
+
+#      define R500_BMASK_ENABLE                             (0 << 10)
+#      define R500_BMASK_DISABLE                            (1 << 10)
+#      define R500_HIZ_EQUAL_REJECT_DISABLE                 (0 << 11)
+#      define R500_HIZ_EQUAL_REJECT_ENABLE                  (1 << 11)
+#      define R500_HIZ_FP_EXP_BITS_DISABLE                  (0 << 12)
+#      define R500_HIZ_FP_EXP_BITS_1                        (1 << 12)
+#      define R500_HIZ_FP_EXP_BITS_2                        (2 << 12)
+#      define R500_HIZ_FP_EXP_BITS_3                        (3 << 12)
+#      define R500_HIZ_FP_EXP_BITS_4                        (4 << 12)
+#      define R500_HIZ_FP_EXP_BITS_5                        (5 << 12)
+#      define R500_HIZ_FP_INVERT_LEADING_ONES               (0 << 15)
+#      define R500_HIZ_FP_INVERT_LEADING_ZEROS              (1 << 15)
+#      define R500_TILE_OVERWRITE_RECOMPRESSION_ENABLE      (0 << 16)
+#      define R500_TILE_OVERWRITE_RECOMPRESSION_DISABLE     (1 << 16)
+#      define R500_CONTIGUOUS_6XAA_SAMPLES_ENABLE           (0 << 17)
+#      define R500_CONTIGUOUS_6XAA_SAMPLES_DISABLE          (1 << 17)
+#      define R500_PEQ_PACKING_DISABLE                      (0 << 18)
+#      define R500_PEQ_PACKING_ENABLE                       (1 << 18)
+#      define R500_COVERED_PTR_MASKING_DISABLE              (0 << 18)
+#      define R500_COVERED_PTR_MASKING_ENABLE               (1 << 18)
+
+
+/* gap */
+
+/* Z Buffer Address Offset.
+ * Bits 31 to 5 are used for aligned Z buffer address offset for macro tiles.
+ */
+#define R300_ZB_DEPTHOFFSET               0x4f20
+
+/* Z Buffer Pitch and Endian Control */
+#define R300_ZB_DEPTHPITCH                0x4f24
+#       define R300_DEPTHPITCH_MASK              0x00003FFC
+#       define R300_DEPTHMACROTILE_DISABLE      (0 << 16)
+#       define R300_DEPTHMACROTILE_ENABLE       (1 << 16)
+#       define R300_DEPTHMICROTILE_LINEAR       (0 << 17)
+#       define R300_DEPTHMICROTILE_TILED        (1 << 17)
+#       define R300_DEPTHMICROTILE_TILED_SQUARE (2 << 17)
+#       define R300_DEPTHENDIAN_NO_SWAP         (0 << 18)
+#       define R300_DEPTHENDIAN_WORD_SWAP       (1 << 18)
+#       define R300_DEPTHENDIAN_DWORD_SWAP      (2 << 18)
+#       define R300_DEPTHENDIAN_HALF_DWORD_SWAP (3 << 18)
+
+/* Z Buffer Clear Value */
+#define R300_ZB_DEPTHCLEARVALUE                  0x4f28
+
+/* Hierarchical Z Memory Offset */
+#define R300_ZB_HIZ_OFFSET                       0x4f44
+
+/* Hierarchical Z Write Index */
+#define R300_ZB_HIZ_WRINDEX                      0x4f48
+
+/* Hierarchical Z Data */
+#define R300_ZB_HIZ_DWORD                        0x4f4c
+
+/* Hierarchical Z Read Index */
+#define R300_ZB_HIZ_RDINDEX                      0x4f50
+
+/* Hierarchical Z Pitch */
+#define R300_ZB_HIZ_PITCH                        0x4f54
+
+/* Z Buffer Z Pass Counter Data */
+#define R300_ZB_ZPASS_DATA                       0x4f58
+
+/* Z Buffer Z Pass Counter Address */
+#define R300_ZB_ZPASS_ADDR                       0x4f5c
+
+/* Depth buffer X and Y coordinate offset */
+#define R300_ZB_DEPTHXY_OFFSET                   0x4f60
+#      define R300_DEPTHX_OFFSET_SHIFT  1
+#      define R300_DEPTHX_OFFSET_MASK   0x000007FE
+#      define R300_DEPTHY_OFFSET_SHIFT  17
+#      define R300_DEPTHY_OFFSET_MASK   0x07FE0000
+
+/* Sets the fifo sizes */
+#define R500_ZB_FIFO_SIZE                        0x4fd0
+#      define R500_OP_FIFO_SIZE_FULL   (0 << 0)
+#      define R500_OP_FIFO_SIZE_HALF   (1 << 0)
+#      define R500_OP_FIFO_SIZE_QUATER (2 << 0)
+#      define R500_OP_FIFO_SIZE_EIGTHS (4 << 0)
+
+/* Stencil Reference Value and Mask for backfacing quads */
+/* R300_ZB_STENCILREFMASK handles front face */
+#define R500_ZB_STENCILREFMASK_BF                0x4fd4
+#      define R500_STENCILREF_SHIFT       0
+#      define R500_STENCILREF_MASK        0x000000ff
+#      define R500_STENCILMASK_SHIFT      8
+#      define R500_STENCILMASK_MASK       0x0000ff00
+#      define R500_STENCILWRITEMASK_SHIFT 16
+#      define R500_STENCILWRITEMASK_MASK  0x00ff0000
+
+/**
+ * \defgroup R3XX_R5XX_PROGRAMMABLE_VERTEX_SHADER_DESCRIPTION R3XX-R5XX PROGRAMMABLE VERTEX SHADER DESCRIPTION
+ *
+ * The PVS_DST_MATH_INST is used to identify whether the instruction is a Vector
+ * Engine instruction or a Math Engine instruction.
+ */
+
+/*\{*/
+
+enum {
+       /* R3XX */
+       VECTOR_NO_OP                    = 0,
+       VE_DOT_PRODUCT                  = 1,
+       VE_MULTIPLY                     = 2,
+       VE_ADD                          = 3,
+       VE_MULTIPLY_ADD                 = 4,
+       VE_DISTANCE_VECTOR              = 5,
+       VE_FRACTION                     = 6,
+       VE_MAXIMUM                      = 7,
+       VE_MINIMUM                      = 8,
+       VE_SET_GREATER_THAN_EQUAL       = 9,
+       VE_SET_LESS_THAN                = 10,
+       VE_MULTIPLYX2_ADD               = 11,
+       VE_MULTIPLY_CLAMP               = 12,
+       VE_FLT2FIX_DX                   = 13,
+       VE_FLT2FIX_DX_RND               = 14,
+       /* R5XX */
+       VE_PRED_SET_EQ_PUSH             = 15,
+       VE_PRED_SET_GT_PUSH             = 16,
+       VE_PRED_SET_GTE_PUSH            = 17,
+       VE_PRED_SET_NEQ_PUSH            = 18,
+       VE_COND_WRITE_EQ                = 19,
+       VE_COND_WRITE_GT                = 20,
+       VE_COND_WRITE_GTE               = 21,
+       VE_COND_WRITE_NEQ               = 22,
+       VE_COND_MUX_EQ                  = 23,
+       VE_COND_MUX_GT                  = 24,
+       VE_COND_MUX_GTE                 = 25,
+       VE_SET_GREATER_THAN             = 26,
+       VE_SET_EQUAL                    = 27,
+       VE_SET_NOT_EQUAL                = 28,
+};
+
+enum {
+       /* R3XX */
+       MATH_NO_OP                      = 0,
+       ME_EXP_BASE2_DX                 = 1,
+       ME_LOG_BASE2_DX                 = 2,
+       ME_EXP_BASEE_FF                 = 3,
+       ME_LIGHT_COEFF_DX               = 4,
+       ME_POWER_FUNC_FF                = 5,
+       ME_RECIP_DX                     = 6,
+       ME_RECIP_FF                     = 7,
+       ME_RECIP_SQRT_DX                = 8,
+       ME_RECIP_SQRT_FF                = 9,
+       ME_MULTIPLY                     = 10,
+       ME_EXP_BASE2_FULL_DX            = 11,
+       ME_LOG_BASE2_FULL_DX            = 12,
+       ME_POWER_FUNC_FF_CLAMP_B        = 13,
+       ME_POWER_FUNC_FF_CLAMP_B1       = 14,
+       ME_POWER_FUNC_FF_CLAMP_01       = 15,
+       ME_SIN                          = 16,
+       ME_COS                          = 17,
+       /* R5XX */
+       ME_LOG_BASE2_IEEE               = 18,
+       ME_RECIP_IEEE                   = 19,
+       ME_RECIP_SQRT_IEEE              = 20,
+       ME_PRED_SET_EQ                  = 21,
+       ME_PRED_SET_GT                  = 22,
+       ME_PRED_SET_GTE                 = 23,
+       ME_PRED_SET_NEQ                 = 24,
+       ME_PRED_SET_CLR                 = 25,
+       ME_PRED_SET_INV                 = 26,
+       ME_PRED_SET_POP                 = 27,
+       ME_PRED_SET_RESTORE             = 28,
+};
+
+enum {
+       /* R3XX */
+       PVS_MACRO_OP_2CLK_MADD          = 0,
+       PVS_MACRO_OP_2CLK_M2X_ADD       = 1,
+};
+
+enum {
+       PVS_SRC_REG_TEMPORARY           = 0,    /* Intermediate Storage */
+       PVS_SRC_REG_INPUT               = 1,    /* Input Vertex Storage */
+       PVS_SRC_REG_CONSTANT            = 2,    /* Constant State Storage */
+       PVS_SRC_REG_ALT_TEMPORARY       = 3,    /* Alternate Intermediate Storage */
+};
+
+enum {
+       PVS_DST_REG_TEMPORARY           = 0,    /* Intermediate Storage */
+       PVS_DST_REG_A0                  = 1,    /* Address Register Storage */
+       PVS_DST_REG_OUT                 = 2,    /* Output Memory. Used for all outputs */
+       PVS_DST_REG_OUT_REPL_X          = 3,    /* Output Memory & Replicate X to all channels */
+       PVS_DST_REG_ALT_TEMPORARY       = 4,    /* Alternate Intermediate Storage */
+       PVS_DST_REG_INPUT               = 5,    /* Output Memory & Replicate X to all channels */
+};
+
+enum {
+       PVS_SRC_SELECT_X                = 0,    /* Select X Component */
+       PVS_SRC_SELECT_Y                = 1,    /* Select Y Component */
+       PVS_SRC_SELECT_Z                = 2,    /* Select Z Component */
+       PVS_SRC_SELECT_W                = 3,    /* Select W Component */
+       PVS_SRC_SELECT_FORCE_0          = 4,    /* Force Component to 0.0 */
+       PVS_SRC_SELECT_FORCE_1          = 5,    /* Force Component to 1.0 */
+};
+
+/* PVS Opcode & Destination Operand Description */
+
+enum {
+       PVS_DST_OPCODE_MASK             = 0x3f,
+       PVS_DST_OPCODE_SHIFT            = 0,
+       PVS_DST_MATH_INST_MASK          = 0x1,
+       PVS_DST_MATH_INST_SHIFT         = 6,
+       PVS_DST_MACRO_INST_MASK         = 0x1,
+       PVS_DST_MACRO_INST_SHIFT        = 7,
+       PVS_DST_REG_TYPE_MASK           = 0xf,
+       PVS_DST_REG_TYPE_SHIFT          = 8,
+       PVS_DST_ADDR_MODE_1_MASK        = 0x1,
+       PVS_DST_ADDR_MODE_1_SHIFT       = 12,
+       PVS_DST_OFFSET_MASK             = 0x7f,
+       PVS_DST_OFFSET_SHIFT            = 13,
+       PVS_DST_WE_X_MASK               = 0x1,
+       PVS_DST_WE_X_SHIFT              = 20,
+       PVS_DST_WE_Y_MASK               = 0x1,
+       PVS_DST_WE_Y_SHIFT              = 21,
+       PVS_DST_WE_Z_MASK               = 0x1,
+       PVS_DST_WE_Z_SHIFT              = 22,
+       PVS_DST_WE_W_MASK               = 0x1,
+       PVS_DST_WE_W_SHIFT              = 23,
+       PVS_DST_VE_SAT_MASK             = 0x1,
+       PVS_DST_VE_SAT_SHIFT            = 24,
+       PVS_DST_ME_SAT_MASK             = 0x1,
+       PVS_DST_ME_SAT_SHIFT            = 25,
+       PVS_DST_PRED_ENABLE_MASK        = 0x1,
+       PVS_DST_PRED_ENABLE_SHIFT       = 26,
+       PVS_DST_PRED_SENSE_MASK         = 0x1,
+       PVS_DST_PRED_SENSE_SHIFT        = 27,
+       PVS_DST_DUAL_MATH_OP_MASK       = 0x3,
+       PVS_DST_DUAL_MATH_OP_SHIFT      = 27,
+       PVS_DST_ADDR_SEL_MASK           = 0x3,
+       PVS_DST_ADDR_SEL_SHIFT          = 29,
+       PVS_DST_ADDR_MODE_0_MASK        = 0x1,
+       PVS_DST_ADDR_MODE_0_SHIFT       = 31,
+};
+
+/* PVS Source Operand Description */
+
+enum {
+       PVS_SRC_REG_TYPE_MASK           = 0x3,
+       PVS_SRC_REG_TYPE_SHIFT          = 0,
+       SPARE_0_MASK                    = 0x1,
+       SPARE_0_SHIFT                   = 2,
+       PVS_SRC_ABS_XYZW_MASK           = 0x1,
+       PVS_SRC_ABS_XYZW_SHIFT          = 3,
+       PVS_SRC_ADDR_MODE_0_MASK        = 0x1,
+       PVS_SRC_ADDR_MODE_0_SHIFT       = 4,
+       PVS_SRC_OFFSET_MASK             = 0xff,
+       PVS_SRC_OFFSET_SHIFT            = 5,
+       PVS_SRC_SWIZZLE_X_MASK          = 0x7,
+       PVS_SRC_SWIZZLE_X_SHIFT         = 13,
+       PVS_SRC_SWIZZLE_Y_MASK          = 0x7,
+       PVS_SRC_SWIZZLE_Y_SHIFT         = 16,
+       PVS_SRC_SWIZZLE_Z_MASK          = 0x7,
+       PVS_SRC_SWIZZLE_Z_SHIFT         = 19,
+       PVS_SRC_SWIZZLE_W_MASK          = 0x7,
+       PVS_SRC_SWIZZLE_W_SHIFT         = 22,
+       PVS_SRC_MODIFIER_X_MASK         = 0x1,
+       PVS_SRC_MODIFIER_X_SHIFT        = 25,
+       PVS_SRC_MODIFIER_Y_MASK         = 0x1,
+       PVS_SRC_MODIFIER_Y_SHIFT        = 26,
+       PVS_SRC_MODIFIER_Z_MASK         = 0x1,
+       PVS_SRC_MODIFIER_Z_SHIFT        = 27,
+       PVS_SRC_MODIFIER_W_MASK         = 0x1,
+       PVS_SRC_MODIFIER_W_SHIFT        = 28,
+       PVS_SRC_ADDR_SEL_MASK           = 0x3,
+       PVS_SRC_ADDR_SEL_SHIFT          = 29,
+       PVS_SRC_ADDR_MODE_1_MASK        = 0x0,
+       PVS_SRC_ADDR_MODE_1_SHIFT       = 32,
+};
+
+/*\}*/
+
+/* BEGIN: Packet 3 commands */
+
+/* A primitive emission dword. */
+#define R300_PRIM_TYPE_NONE                     (0 << 0)
+#define R300_PRIM_TYPE_POINT                    (1 << 0)
+#define R300_PRIM_TYPE_LINE                     (2 << 0)
+#define R300_PRIM_TYPE_LINE_STRIP               (3 << 0)
+#define R300_PRIM_TYPE_TRI_LIST                 (4 << 0)
+#define R300_PRIM_TYPE_TRI_FAN                  (5 << 0)
+#define R300_PRIM_TYPE_TRI_STRIP                (6 << 0)
+#define R300_PRIM_TYPE_TRI_TYPE2                (7 << 0)
+#define R300_PRIM_TYPE_RECT_LIST                (8 << 0)
+#define R300_PRIM_TYPE_3VRT_POINT_LIST          (9 << 0)
+#define R300_PRIM_TYPE_3VRT_LINE_LIST           (10 << 0)
+       /* GUESS (based on r200) */
+#define R300_PRIM_TYPE_POINT_SPRITES            (11 << 0)
+#define R300_PRIM_TYPE_LINE_LOOP                (12 << 0)
+#define R300_PRIM_TYPE_QUADS                    (13 << 0)
+#define R300_PRIM_TYPE_QUAD_STRIP               (14 << 0)
+#define R300_PRIM_TYPE_POLYGON                  (15 << 0)
+#define R300_PRIM_TYPE_MASK                     0xF
+#define R300_PRIM_WALK_IND                      (1 << 4)
+#define R300_PRIM_WALK_LIST                     (2 << 4)
+#define R300_PRIM_WALK_RING                     (3 << 4)
+#define R300_PRIM_WALK_MASK                     (3 << 4)
+       /* GUESS (based on r200) */
+#define R300_PRIM_COLOR_ORDER_BGRA              (0 << 6)
+#define R300_PRIM_COLOR_ORDER_RGBA              (1 << 6)
+#define R300_PRIM_NUM_VERTICES_SHIFT            16
+#define R300_PRIM_NUM_VERTICES_MASK             0xffff
+
+
+
+/*
+ * The R500 unified shader (US) registers come in banks of 512 each, one
+ * for each instruction slot in the shader.  You can't touch them directly.
+ * R500_US_VECTOR_INDEX() sets the base instruction to modify; successive
+ * writes to R500_GA_US_VECTOR_DATA autoincrement the index after the
+ * instruction is fully specified.
+ */
+#define R500_US_ALU_ALPHA_INST_0                       0xa800
+#   define R500_ALPHA_OP_MAD                           0
+#   define R500_ALPHA_OP_DP                            1
+#   define R500_ALPHA_OP_MIN                           2
+#   define R500_ALPHA_OP_MAX                           3
+/* #define R500_ALPHA_OP_RESERVED                      4 */
+#   define R500_ALPHA_OP_CND                           5
+#   define R500_ALPHA_OP_CMP                           6
+#   define R500_ALPHA_OP_FRC                           7
+#   define R500_ALPHA_OP_EX2                           8
+#   define R500_ALPHA_OP_LN2                           9
+#   define R500_ALPHA_OP_RCP                           10
+#   define R500_ALPHA_OP_RSQ                           11
+#   define R500_ALPHA_OP_SIN                           12
+#   define R500_ALPHA_OP_COS                           13
+#   define R500_ALPHA_OP_MDH                           14
+#   define R500_ALPHA_OP_MDV                           15
+#   define R500_ALPHA_ADDRD(x)                         ((x) << 4)
+#   define R500_ALPHA_ADDRD_REL                                (1 << 11)
+#  define R500_ALPHA_SEL_A_SHIFT                       12
+#   define R500_ALPHA_SEL_A_SRC0                       (0 << 12)
+#   define R500_ALPHA_SEL_A_SRC1                       (1 << 12)
+#   define R500_ALPHA_SEL_A_SRC2                       (2 << 12)
+#   define R500_ALPHA_SEL_A_SRCP                       (3 << 12)
+#   define R500_ALPHA_SWIZ_A_R                         (0 << 14)
+#   define R500_ALPHA_SWIZ_A_G                         (1 << 14)
+#   define R500_ALPHA_SWIZ_A_B                         (2 << 14)
+#   define R500_ALPHA_SWIZ_A_A                         (3 << 14)
+#   define R500_ALPHA_SWIZ_A_0                         (4 << 14)
+#   define R500_ALPHA_SWIZ_A_HALF                      (5 << 14)
+#   define R500_ALPHA_SWIZ_A_1                         (6 << 14)
+/* #define R500_ALPHA_SWIZ_A_UNUSED                    (7 << 14) */
+#   define R500_ALPHA_MOD_A_NOP                                (0 << 17)
+#   define R500_ALPHA_MOD_A_NEG                                (1 << 17)
+#   define R500_ALPHA_MOD_A_ABS                                (2 << 17)
+#   define R500_ALPHA_MOD_A_NAB                                (3 << 17)
+#  define R500_ALPHA_SEL_B_SHIFT                       19
+#   define R500_ALPHA_SEL_B_SRC0                       (0 << 19)
+#   define R500_ALPHA_SEL_B_SRC1                       (1 << 19)
+#   define R500_ALPHA_SEL_B_SRC2                       (2 << 19)
+#   define R500_ALPHA_SEL_B_SRCP                       (3 << 19)
+#   define R500_ALPHA_SWIZ_B_R                         (0 << 21)
+#   define R500_ALPHA_SWIZ_B_G                         (1 << 21)
+#   define R500_ALPHA_SWIZ_B_B                         (2 << 21)
+#   define R500_ALPHA_SWIZ_B_A                         (3 << 21)
+#   define R500_ALPHA_SWIZ_B_0                         (4 << 21)
+#   define R500_ALPHA_SWIZ_B_HALF                      (5 << 21)
+#   define R500_ALPHA_SWIZ_B_1                         (6 << 21)
+/* #define R500_ALPHA_SWIZ_B_UNUSED                    (7 << 21) */
+#   define R500_ALPHA_MOD_B_NOP                                (0 << 24)
+#   define R500_ALPHA_MOD_B_NEG                                (1 << 24)
+#   define R500_ALPHA_MOD_B_ABS                                (2 << 24)
+#   define R500_ALPHA_MOD_B_NAB                                (3 << 24)
+#   define R500_ALPHA_OMOD_IDENTITY                    (0 << 26)
+#   define R500_ALPHA_OMOD_MUL_2                       (1 << 26)
+#   define R500_ALPHA_OMOD_MUL_4                       (2 << 26)
+#   define R500_ALPHA_OMOD_MUL_8                       (3 << 26)
+#   define R500_ALPHA_OMOD_DIV_2                       (4 << 26)
+#   define R500_ALPHA_OMOD_DIV_4                       (5 << 26)
+#   define R500_ALPHA_OMOD_DIV_8                       (6 << 26)
+#   define R500_ALPHA_OMOD_DISABLE                     (7 << 26)
+#   define R500_ALPHA_TARGET(x)                                ((x) << 29)
+#   define R500_ALPHA_W_OMASK                          (1 << 31)
+#define R500_US_ALU_ALPHA_ADDR_0                       0x9800
+#   define R500_ALPHA_ADDR0(x)                         ((x) << 0)
+#   define R500_ALPHA_ADDR0_CONST                      (1 << 8)
+#   define R500_ALPHA_ADDR0_REL                                (1 << 9)
+#   define R500_ALPHA_ADDR1(x)                         ((x) << 10)
+#   define R500_ALPHA_ADDR1_CONST                      (1 << 18)
+#   define R500_ALPHA_ADDR1_REL                                (1 << 19)
+#   define R500_ALPHA_ADDR2(x)                         ((x) << 20)
+#   define R500_ALPHA_ADDR2_CONST                      (1 << 28)
+#   define R500_ALPHA_ADDR2_REL                                (1 << 29)
+#   define R500_ALPHA_SRCP_OP_1_MINUS_2A0              (0 << 30)
+#   define R500_ALPHA_SRCP_OP_A1_MINUS_A0              (1 << 30)
+#   define R500_ALPHA_SRCP_OP_A1_PLUS_A0               (2 << 30)
+#   define R500_ALPHA_SRCP_OP_1_MINUS_A0               (3 << 30)
+#define R500_US_ALU_RGBA_INST_0                                0xb000
+#   define R500_ALU_RGBA_OP_MAD                                (0 << 0)
+#   define R500_ALU_RGBA_OP_DP3                                (1 << 0)
+#   define R500_ALU_RGBA_OP_DP4                                (2 << 0)
+#   define R500_ALU_RGBA_OP_D2A                                (3 << 0)
+#   define R500_ALU_RGBA_OP_MIN                                (4 << 0)
+#   define R500_ALU_RGBA_OP_MAX                                (5 << 0)
+/* #define R500_ALU_RGBA_OP_RESERVED                   (6 << 0) */
+#   define R500_ALU_RGBA_OP_CND                                (7 << 0)
+#   define R500_ALU_RGBA_OP_CMP                                (8 << 0)
+#   define R500_ALU_RGBA_OP_FRC                                (9 << 0)
+#   define R500_ALU_RGBA_OP_SOP                                (10 << 0)
+#   define R500_ALU_RGBA_OP_MDH                                (11 << 0)
+#   define R500_ALU_RGBA_OP_MDV                                (12 << 0)
+#   define R500_ALU_RGBA_ADDRD(x)                      ((x) << 4)
+#   define R500_ALU_RGBA_ADDRD_REL                     (1 << 11)
+#  define R500_ALU_RGBA_SEL_C_SHIFT                    12
+#   define R500_ALU_RGBA_SEL_C_SRC0                    (0 << 12)
+#   define R500_ALU_RGBA_SEL_C_SRC1                    (1 << 12)
+#   define R500_ALU_RGBA_SEL_C_SRC2                    (2 << 12)
+#   define R500_ALU_RGBA_SEL_C_SRCP                    (3 << 12)
+#   define R500_ALU_RGBA_R_SWIZ_R                      (0 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_G                      (1 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_B                      (2 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_A                      (3 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_0                      (4 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_HALF                   (5 << 14)
+#   define R500_ALU_RGBA_R_SWIZ_1                      (6 << 14)
+/* #define R500_ALU_RGBA_R_SWIZ_UNUSED                 (7 << 14) */
+#   define R500_ALU_RGBA_G_SWIZ_R                      (0 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_G                      (1 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_B                      (2 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_A                      (3 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_0                      (4 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_HALF                   (5 << 17)
+#   define R500_ALU_RGBA_G_SWIZ_1                      (6 << 17)
+/* #define R500_ALU_RGBA_G_SWIZ_UNUSED                 (7 << 17) */
+#   define R500_ALU_RGBA_B_SWIZ_R                      (0 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_G                      (1 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_B                      (2 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_A                      (3 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_0                      (4 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_HALF                   (5 << 20)
+#   define R500_ALU_RGBA_B_SWIZ_1                      (6 << 20)
+/* #define R500_ALU_RGBA_B_SWIZ_UNUSED                 (7 << 20) */
+#   define R500_ALU_RGBA_MOD_C_NOP                     (0 << 23)
+#   define R500_ALU_RGBA_MOD_C_NEG                     (1 << 23)
+#   define R500_ALU_RGBA_MOD_C_ABS                     (2 << 23)
+#   define R500_ALU_RGBA_MOD_C_NAB                     (3 << 23)
+#  define R500_ALU_RGBA_ALPHA_SEL_C_SHIFT              25
+#   define R500_ALU_RGBA_ALPHA_SEL_C_SRC0              (0 << 25)
+#   define R500_ALU_RGBA_ALPHA_SEL_C_SRC1              (1 << 25)
+#   define R500_ALU_RGBA_ALPHA_SEL_C_SRC2              (2 << 25)
+#   define R500_ALU_RGBA_ALPHA_SEL_C_SRCP              (3 << 25)
+#   define R500_ALU_RGBA_A_SWIZ_R                      (0 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_G                      (1 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_B                      (2 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_A                      (3 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_0                      (4 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_HALF                   (5 << 27)
+#   define R500_ALU_RGBA_A_SWIZ_1                      (6 << 27)
+/* #define R500_ALU_RGBA_A_SWIZ_UNUSED                 (7 << 27) */
+#   define R500_ALU_RGBA_ALPHA_MOD_C_NOP               (0 << 30)
+#   define R500_ALU_RGBA_ALPHA_MOD_C_NEG               (1 << 30)
+#   define R500_ALU_RGBA_ALPHA_MOD_C_ABS               (2 << 30)
+#   define R500_ALU_RGBA_ALPHA_MOD_C_NAB               (3 << 30)
+#define R500_US_ALU_RGB_INST_0                         0xa000
+#  define R500_ALU_RGB_SEL_A_SHIFT                     0
+#   define R500_ALU_RGB_SEL_A_SRC0                     (0 << 0)
+#   define R500_ALU_RGB_SEL_A_SRC1                     (1 << 0)
+#   define R500_ALU_RGB_SEL_A_SRC2                     (2 << 0)
+#   define R500_ALU_RGB_SEL_A_SRCP                     (3 << 0)
+#   define R500_ALU_RGB_R_SWIZ_A_R                     (0 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_G                     (1 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_B                     (2 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_A                     (3 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_0                     (4 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_HALF                  (5 << 2)
+#   define R500_ALU_RGB_R_SWIZ_A_1                     (6 << 2)
+/* #define R500_ALU_RGB_R_SWIZ_A_UNUSED                        (7 << 2) */
+#   define R500_ALU_RGB_G_SWIZ_A_R                     (0 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_G                     (1 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_B                     (2 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_A                     (3 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_0                     (4 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_HALF                  (5 << 5)
+#   define R500_ALU_RGB_G_SWIZ_A_1                     (6 << 5)
+/* #define R500_ALU_RGB_G_SWIZ_A_UNUSED                        (7 << 5) */
+#   define R500_ALU_RGB_B_SWIZ_A_R                     (0 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_G                     (1 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_B                     (2 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_A                     (3 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_0                     (4 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_HALF                  (5 << 8)
+#   define R500_ALU_RGB_B_SWIZ_A_1                     (6 << 8)
+/* #define R500_ALU_RGB_B_SWIZ_A_UNUSED                        (7 << 8) */
+#   define R500_ALU_RGB_MOD_A_NOP                      (0 << 11)
+#   define R500_ALU_RGB_MOD_A_NEG                      (1 << 11)
+#   define R500_ALU_RGB_MOD_A_ABS                      (2 << 11)
+#   define R500_ALU_RGB_MOD_A_NAB                      (3 << 11)
+#  define R500_ALU_RGB_SEL_B_SHIFT                     13
+#   define R500_ALU_RGB_SEL_B_SRC0                     (0 << 13)
+#   define R500_ALU_RGB_SEL_B_SRC1                     (1 << 13)
+#   define R500_ALU_RGB_SEL_B_SRC2                     (2 << 13)
+#   define R500_ALU_RGB_SEL_B_SRCP                     (3 << 13)
+#   define R500_ALU_RGB_R_SWIZ_B_R                     (0 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_G                     (1 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_B                     (2 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_A                     (3 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_0                     (4 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_HALF                  (5 << 15)
+#   define R500_ALU_RGB_R_SWIZ_B_1                     (6 << 15)
+/* #define R500_ALU_RGB_R_SWIZ_B_UNUSED                        (7 << 15) */
+#   define R500_ALU_RGB_G_SWIZ_B_R                     (0 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_G                     (1 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_B                     (2 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_A                     (3 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_0                     (4 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_HALF                  (5 << 18)
+#   define R500_ALU_RGB_G_SWIZ_B_1                     (6 << 18)
+/* #define R500_ALU_RGB_G_SWIZ_B_UNUSED                        (7 << 18) */
+#   define R500_ALU_RGB_B_SWIZ_B_R                     (0 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_G                     (1 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_B                     (2 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_A                     (3 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_0                     (4 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_HALF                  (5 << 21)
+#   define R500_ALU_RGB_B_SWIZ_B_1                     (6 << 21)
+/* #define R500_ALU_RGB_B_SWIZ_B_UNUSED                        (7 << 21) */
+#   define R500_ALU_RGB_MOD_B_NOP                      (0 << 24)
+#   define R500_ALU_RGB_MOD_B_NEG                      (1 << 24)
+#   define R500_ALU_RGB_MOD_B_ABS                      (2 << 24)
+#   define R500_ALU_RGB_MOD_B_NAB                      (3 << 24)
+#   define R500_ALU_RGB_OMOD_IDENTITY                  (0 << 26)
+#   define R500_ALU_RGB_OMOD_MUL_2                     (1 << 26)
+#   define R500_ALU_RGB_OMOD_MUL_4                     (2 << 26)
+#   define R500_ALU_RGB_OMOD_MUL_8                     (3 << 26)
+#   define R500_ALU_RGB_OMOD_DIV_2                     (4 << 26)
+#   define R500_ALU_RGB_OMOD_DIV_4                     (5 << 26)
+#   define R500_ALU_RGB_OMOD_DIV_8                     (6 << 26)
+#   define R500_ALU_RGB_OMOD_DISABLE                   (7 << 26)
+#   define R500_ALU_RGB_TARGET(x)                      ((x) << 29)
+#   define R500_ALU_RGB_WMASK                          (1 << 31)
+#define R500_US_ALU_RGB_ADDR_0                         0x9000
+#   define R500_RGB_ADDR0(x)                           ((x) << 0)
+#   define R500_RGB_ADDR0_CONST                                (1 << 8)
+#   define R500_RGB_ADDR0_REL                          (1 << 9)
+#   define R500_RGB_ADDR1(x)                           ((x) << 10)
+#   define R500_RGB_ADDR1_CONST                                (1 << 18)
+#   define R500_RGB_ADDR1_REL                          (1 << 19)
+#   define R500_RGB_ADDR2(x)                           ((x) << 20)
+#   define R500_RGB_ADDR2_CONST                                (1 << 28)
+#   define R500_RGB_ADDR2_REL                          (1 << 29)
+#   define R500_RGB_SRCP_OP_1_MINUS_2RGB0              (0 << 30)
+#   define R500_RGB_SRCP_OP_RGB1_MINUS_RGB0            (1 << 30)
+#   define R500_RGB_SRCP_OP_RGB1_PLUS_RGB0             (2 << 30)
+#   define R500_RGB_SRCP_OP_1_MINUS_RGB0               (3 << 30)
+#define R500_US_CMN_INST_0                             0xb800
+#  define R500_INST_TYPE_MASK                          (3 << 0)
+#   define R500_INST_TYPE_ALU                          (0 << 0)
+#   define R500_INST_TYPE_OUT                          (1 << 0)
+#   define R500_INST_TYPE_FC                           (2 << 0)
+#   define R500_INST_TYPE_TEX                          (3 << 0)
+#   define R500_INST_TEX_SEM_WAIT                      (1 << 2)
+#   define R500_INST_RGB_PRED_SEL_NONE                 (0 << 3)
+#   define R500_INST_RGB_PRED_SEL_RGBA                 (1 << 3)
+#   define R500_INST_RGB_PRED_SEL_RRRR                 (2 << 3)
+#   define R500_INST_RGB_PRED_SEL_GGGG                 (3 << 3)
+#   define R500_INST_RGB_PRED_SEL_BBBB                 (4 << 3)
+#   define R500_INST_RGB_PRED_SEL_AAAA                 (5 << 3)
+#   define R500_INST_RGB_PRED_INV                      (1 << 6)
+#   define R500_INST_WRITE_INACTIVE                    (1 << 7)
+#   define R500_INST_LAST                              (1 << 8)
+#   define R500_INST_NOP                               (1 << 9)
+#   define R500_INST_ALU_WAIT                          (1 << 10)
+#   define R500_INST_RGB_WMASK_R                       (1 << 11)
+#   define R500_INST_RGB_WMASK_G                       (1 << 12)
+#   define R500_INST_RGB_WMASK_B                       (1 << 13)
+#   define R500_INST_ALPHA_WMASK                       (1 << 14)
+#   define R500_INST_RGB_OMASK_R                       (1 << 15)
+#   define R500_INST_RGB_OMASK_G                       (1 << 16)
+#   define R500_INST_RGB_OMASK_B                       (1 << 17)
+#   define R500_INST_ALPHA_OMASK                       (1 << 18)
+#   define R500_INST_RGB_CLAMP                         (1 << 19)
+#   define R500_INST_ALPHA_CLAMP                       (1 << 20)
+#   define R500_INST_ALU_RESULT_SEL                    (1 << 21)
+#   define R500_INST_ALPHA_PRED_INV                    (1 << 22)
+#   define R500_INST_ALU_RESULT_OP_EQ                  (0 << 23)
+#   define R500_INST_ALU_RESULT_OP_LT                  (1 << 23)
+#   define R500_INST_ALU_RESULT_OP_GE                  (2 << 23)
+#   define R500_INST_ALU_RESULT_OP_NE                  (3 << 23)
+#   define R500_INST_ALPHA_PRED_SEL_NONE               (0 << 25)
+#   define R500_INST_ALPHA_PRED_SEL_RGBA               (1 << 25)
+#   define R500_INST_ALPHA_PRED_SEL_RRRR               (2 << 25)
+#   define R500_INST_ALPHA_PRED_SEL_GGGG               (3 << 25)
+#   define R500_INST_ALPHA_PRED_SEL_BBBB               (4 << 25)
+#   define R500_INST_ALPHA_PRED_SEL_AAAA               (5 << 25)
+/* XXX next four are kind of guessed */
+#   define R500_INST_STAT_WE_R                         (1 << 28)
+#   define R500_INST_STAT_WE_G                         (1 << 29)
+#   define R500_INST_STAT_WE_B                         (1 << 30)
+#   define R500_INST_STAT_WE_A                         (1 << 31)
+
+/* note that these are 8 bit lengths, despite the offsets, at least for R500 */
+#define R500_US_CODE_ADDR                              0x4630
+#   define R500_US_CODE_START_ADDR(x)                  ((x) << 0)
+#   define R500_US_CODE_END_ADDR(x)                    ((x) << 16)
+#define R500_US_CODE_OFFSET                            0x4638
+#   define R500_US_CODE_OFFSET_ADDR(x)                 ((x) << 0)
+#define R500_US_CODE_RANGE                             0x4634
+#   define R500_US_CODE_RANGE_ADDR(x)                  ((x) << 0)
+#   define R500_US_CODE_RANGE_SIZE(x)                  ((x) << 16)
+#define R500_US_CONFIG                                 0x4600
+#   define R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO                (1 << 1)
+#define R500_US_FC_ADDR_0                              0xa000
+#   define R500_FC_BOOL_ADDR(x)                                ((x) << 0)
+#   define R500_FC_INT_ADDR(x)                         ((x) << 8)
+#   define R500_FC_JUMP_ADDR(x)                                ((x) << 16)
+#   define R500_FC_JUMP_GLOBAL                         (1 << 31)
+#define R500_US_FC_BOOL_CONST                          0x4620
+#   define R500_FC_KBOOL(x)                            (x)
+#define R500_US_FC_CTRL                                        0x4624
+#   define R500_FC_TEST_EN                             (1 << 30)
+#   define R500_FC_FULL_FC_EN                          (1 << 31)
+#define R500_US_FC_INST_0                              0x9800
+#   define R500_FC_OP_JUMP                             (0 << 0)
+#   define R500_FC_OP_LOOP                             (1 << 0)
+#   define R500_FC_OP_ENDLOOP                          (2 << 0)
+#   define R500_FC_OP_REP                              (3 << 0)
+#   define R500_FC_OP_ENDREP                           (4 << 0)
+#   define R500_FC_OP_BREAKLOOP                                (5 << 0)
+#   define R500_FC_OP_BREAKREP                         (6 << 0)
+#   define R500_FC_OP_CONTINUE                         (7 << 0)
+#   define R500_FC_B_ELSE                              (1 << 4)
+#   define R500_FC_JUMP_ANY                            (1 << 5)
+#   define R500_FC_A_OP_NONE                           (0 << 6)
+#   define R500_FC_A_OP_POP                            (1 << 6)
+#   define R500_FC_A_OP_PUSH                           (2 << 6)
+#   define R500_FC_JUMP_FUNC(x)                                ((x) << 8)
+#   define R500_FC_B_POP_CNT(x)                                ((x) << 16)
+#   define R500_FC_B_OP0_NONE                          (0 << 24)
+#   define R500_FC_B_OP0_DECR                          (1 << 24)
+#   define R500_FC_B_OP0_INCR                          (2 << 24)
+#   define R500_FC_B_OP1_DECR                          (0 << 26)
+#   define R500_FC_B_OP1_NONE                          (1 << 26)
+#   define R500_FC_B_OP1_INCR                          (2 << 26)
+#   define R500_FC_IGNORE_UNCOVERED                    (1 << 28)
+#define R500_US_FC_INT_CONST_0                         0x4c00
+#   define R500_FC_INT_CONST_KR(x)                     ((x) << 0)
+#   define R500_FC_INT_CONST_KG(x)                     ((x) << 8)
+#   define R500_FC_INT_CONST_KB(x)                     ((x) << 16)
+/* _0 through _15 */
+#define R500_US_FORMAT0_0                              0x4640
+#   define R500_FORMAT_TXWIDTH(x)                      ((x) << 0)
+#   define R500_FORMAT_TXHEIGHT(x)                     ((x) << 11)
+#   define R500_FORMAT_TXDEPTH(x)                      ((x) << 22)
+/* _0 through _3 */
+#define R500_US_OUT_FMT_0                              0x46a4
+#   define R500_OUT_FMT_C4_8                           (0 << 0)
+#   define R500_OUT_FMT_C4_10                          (1 << 0)
+#   define R500_OUT_FMT_C4_10_GAMMA                    (2 << 0)
+#   define R500_OUT_FMT_C_16                           (3 << 0)
+#   define R500_OUT_FMT_C2_16                          (4 << 0)
+#   define R500_OUT_FMT_C4_16                          (5 << 0)
+#   define R500_OUT_FMT_C_16_MPEG                      (6 << 0)
+#   define R500_OUT_FMT_C2_16_MPEG                     (7 << 0)
+#   define R500_OUT_FMT_C2_4                           (8 << 0)
+#   define R500_OUT_FMT_C_3_3_2                                (9 << 0)
+#   define R500_OUT_FMT_C_6_5_6                                (10 << 0)
+#   define R500_OUT_FMT_C_11_11_10                     (11 << 0)
+#   define R500_OUT_FMT_C_10_11_11                     (12 << 0)
+#   define R500_OUT_FMT_C_2_10_10_10                   (13 << 0)
+/* #define R500_OUT_FMT_RESERVED                       (14 << 0) */
+#   define R500_OUT_FMT_UNUSED                         (15 << 0)
+#   define R500_OUT_FMT_C_16_FP                                (16 << 0)
+#   define R500_OUT_FMT_C2_16_FP                       (17 << 0)
+#   define R500_OUT_FMT_C4_16_FP                       (18 << 0)
+#   define R500_OUT_FMT_C_32_FP                                (19 << 0)
+#   define R500_OUT_FMT_C2_32_FP                       (20 << 0)
+#   define R500_OUT_FMT_C4_32_FP                       (21 << 0)
+#   define R500_C0_SEL_A                               (0 << 8)
+#   define R500_C0_SEL_R                               (1 << 8)
+#   define R500_C0_SEL_G                               (2 << 8)
+#   define R500_C0_SEL_B                               (3 << 8)
+#   define R500_C1_SEL_A                               (0 << 10)
+#   define R500_C1_SEL_R                               (1 << 10)
+#   define R500_C1_SEL_G                               (2 << 10)
+#   define R500_C1_SEL_B                               (3 << 10)
+#   define R500_C2_SEL_A                               (0 << 12)
+#   define R500_C2_SEL_R                               (1 << 12)
+#   define R500_C2_SEL_G                               (2 << 12)
+#   define R500_C2_SEL_B                               (3 << 12)
+#   define R500_C3_SEL_A                               (0 << 14)
+#   define R500_C3_SEL_R                               (1 << 14)
+#   define R500_C3_SEL_G                               (2 << 14)
+#   define R500_C3_SEL_B                               (3 << 14)
+#   define R500_OUT_SIGN(x)                            ((x) << 16)
+#   define R500_ROUND_ADJ                              (1 << 20)
+#define R500_US_PIXSIZE                                        0x4604
+#   define R500_PIX_SIZE(x)                            (x)
+#define R500_US_TEX_ADDR_0                             0x9800
+#   define R500_TEX_SRC_ADDR(x)                                ((x) << 0)
+#   define R500_TEX_SRC_ADDR_REL                       (1 << 7)
+#   define R500_TEX_SRC_S_SWIZ_R                       (0 << 8)
+#   define R500_TEX_SRC_S_SWIZ_G                       (1 << 8)
+#   define R500_TEX_SRC_S_SWIZ_B                       (2 << 8)
+#   define R500_TEX_SRC_S_SWIZ_A                       (3 << 8)
+#   define R500_TEX_SRC_T_SWIZ_R                       (0 << 10)
+#   define R500_TEX_SRC_T_SWIZ_G                       (1 << 10)
+#   define R500_TEX_SRC_T_SWIZ_B                       (2 << 10)
+#   define R500_TEX_SRC_T_SWIZ_A                       (3 << 10)
+#   define R500_TEX_SRC_R_SWIZ_R                       (0 << 12)
+#   define R500_TEX_SRC_R_SWIZ_G                       (1 << 12)
+#   define R500_TEX_SRC_R_SWIZ_B                       (2 << 12)
+#   define R500_TEX_SRC_R_SWIZ_A                       (3 << 12)
+#   define R500_TEX_SRC_Q_SWIZ_R                       (0 << 14)
+#   define R500_TEX_SRC_Q_SWIZ_G                       (1 << 14)
+#   define R500_TEX_SRC_Q_SWIZ_B                       (2 << 14)
+#   define R500_TEX_SRC_Q_SWIZ_A                       (3 << 14)
+#   define R500_TEX_DST_ADDR(x)                                ((x) << 16)
+#   define R500_TEX_DST_ADDR_REL                       (1 << 23)
+#   define R500_TEX_DST_R_SWIZ_R                       (0 << 24)
+#   define R500_TEX_DST_R_SWIZ_G                       (1 << 24)
+#   define R500_TEX_DST_R_SWIZ_B                       (2 << 24)
+#   define R500_TEX_DST_R_SWIZ_A                       (3 << 24)
+#   define R500_TEX_DST_G_SWIZ_R                       (0 << 26)
+#   define R500_TEX_DST_G_SWIZ_G                       (1 << 26)
+#   define R500_TEX_DST_G_SWIZ_B                       (2 << 26)
+#   define R500_TEX_DST_G_SWIZ_A                       (3 << 26)
+#   define R500_TEX_DST_B_SWIZ_R                       (0 << 28)
+#   define R500_TEX_DST_B_SWIZ_G                       (1 << 28)
+#   define R500_TEX_DST_B_SWIZ_B                       (2 << 28)
+#   define R500_TEX_DST_B_SWIZ_A                       (3 << 28)
+#   define R500_TEX_DST_A_SWIZ_R                       (0 << 30)
+#   define R500_TEX_DST_A_SWIZ_G                       (1 << 30)
+#   define R500_TEX_DST_A_SWIZ_B                       (2 << 30)
+#   define R500_TEX_DST_A_SWIZ_A                       (3 << 30)
+#define R500_US_TEX_ADDR_DXDY_0                                0xa000
+#   define R500_DX_ADDR(x)                             ((x) << 0)
+#   define R500_DX_ADDR_REL                            (1 << 7)
+#   define R500_DX_S_SWIZ_R                            (0 << 8)
+#   define R500_DX_S_SWIZ_G                            (1 << 8)
+#   define R500_DX_S_SWIZ_B                            (2 << 8)
+#   define R500_DX_S_SWIZ_A                            (3 << 8)
+#   define R500_DX_T_SWIZ_R                            (0 << 10)
+#   define R500_DX_T_SWIZ_G                            (1 << 10)
+#   define R500_DX_T_SWIZ_B                            (2 << 10)
+#   define R500_DX_T_SWIZ_A                            (3 << 10)
+#   define R500_DX_R_SWIZ_R                            (0 << 12)
+#   define R500_DX_R_SWIZ_G                            (1 << 12)
+#   define R500_DX_R_SWIZ_B                            (2 << 12)
+#   define R500_DX_R_SWIZ_A                            (3 << 12)
+#   define R500_DX_Q_SWIZ_R                            (0 << 14)
+#   define R500_DX_Q_SWIZ_G                            (1 << 14)
+#   define R500_DX_Q_SWIZ_B                            (2 << 14)
+#   define R500_DX_Q_SWIZ_A                            (3 << 14)
+#   define R500_DY_ADDR(x)                             ((x) << 16)
+#   define R500_DY_ADDR_REL                            (1 << 17)
+#   define R500_DY_S_SWIZ_R                            (0 << 24)
+#   define R500_DY_S_SWIZ_G                            (1 << 24)
+#   define R500_DY_S_SWIZ_B                            (2 << 24)
+#   define R500_DY_S_SWIZ_A                            (3 << 24)
+#   define R500_DY_T_SWIZ_R                            (0 << 26)
+#   define R500_DY_T_SWIZ_G                            (1 << 26)
+#   define R500_DY_T_SWIZ_B                            (2 << 26)
+#   define R500_DY_T_SWIZ_A                            (3 << 26)
+#   define R500_DY_R_SWIZ_R                            (0 << 28)
+#   define R500_DY_R_SWIZ_G                            (1 << 28)
+#   define R500_DY_R_SWIZ_B                            (2 << 28)
+#   define R500_DY_R_SWIZ_A                            (3 << 28)
+#   define R500_DY_Q_SWIZ_R                            (0 << 30)
+#   define R500_DY_Q_SWIZ_G                            (1 << 30)
+#   define R500_DY_Q_SWIZ_B                            (2 << 30)
+#   define R500_DY_Q_SWIZ_A                            (3 << 30)
+#define R500_US_TEX_INST_0                             0x9000
+#   define R500_TEX_ID(x)                              ((x) << 16)
+#   define R500_TEX_INST_NOP                           (0 << 22)
+#   define R500_TEX_INST_LD                            (1 << 22)
+#   define R500_TEX_INST_TEXKILL                       (2 << 22)
+#   define R500_TEX_INST_PROJ                          (3 << 22)
+#   define R500_TEX_INST_LODBIAS                       (4 << 22)
+#   define R500_TEX_INST_LOD                           (5 << 22)
+#   define R500_TEX_INST_DXDY                          (6 << 22)
+#   define R500_TEX_SEM_ACQUIRE                                (1 << 25)
+#   define R500_TEX_IGNORE_UNCOVERED                   (1 << 26)
+#   define R500_TEX_UNSCALED                           (1 << 27)
+#define R300_US_W_FMT                                  0x46b4
+#   define R300_W_FMT_W0                               (0 << 0)
+#   define R300_W_FMT_W24                              (1 << 0)
+#   define R300_W_FMT_W24FP                            (2 << 0)
+#   define R300_W_SRC_US                               (0 << 2)
+#   define R300_W_SRC_RAS                              (1 << 2)
+
+
+/* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR.
+ * Two parameter dwords:
+ * 0. VAP_VTX_FMT: The first parameter is not written to hardware
+ * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword.
+ */
+#define R300_PACKET3_3D_DRAW_VBUF           0x00002800
+
+/* Draw a primitive from immediate vertices in this packet
+ * Up to 16382 dwords:
+ * 0. VAP_VTX_FMT: The first parameter is not written to hardware
+ * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword.
+ * 2 to end: Up to 16380 dwords of vertex data.
+ */
+#define R300_PACKET3_3D_DRAW_IMMD           0x00002900
+
+/* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR and
+ * immediate vertices in this packet
+ * Up to 16382 dwords:
+ * 0. VAP_VTX_FMT: The first parameter is not written to hardware
+ * 1. VAP_VF_CTL: The second parameter is a standard primitive emission dword.
+ * 2 to end: Up to 16380 dwords of vertex data.
+ */
+#define R300_PACKET3_3D_DRAW_INDX           0x00002A00
+
+
+/* Specify the full set of vertex arrays as (address, stride).
+ * The first parameter is the number of vertex arrays specified.
+ * The rest of the command is a variable length list of blocks, where
+ * each block is three dwords long and specifies two arrays.
+ * The first dword of a block is split into two words, the lower significant
+ * word refers to the first array, the more significant word to the second
+ * array in the block.
+ * The low byte of each word contains the size of an array entry in dwords,
+ * the high byte contains the stride of the array.
+ * The second dword of a block contains the pointer to the first array,
+ * the third dword of a block contains the pointer to the second array.
+ * Note that if the total number of arrays is odd, the third dword of
+ * the last block is omitted.
+ */
+#define R300_PACKET3_3D_LOAD_VBPNTR         0x00002F00
+
+#define R300_PACKET3_INDX_BUFFER            0x00003300
+#    define R300_INDX_BUFFER_DST_SHIFT          0
+#    define R300_INDX_BUFFER_SKIP_SHIFT         16
+#    define R300_INDX_BUFFER_ONE_REG_WR                (1<<31)
+
+/* Same as R300_PACKET3_3D_DRAW_VBUF but without VAP_VTX_FMT */
+#define R300_PACKET3_3D_DRAW_VBUF_2         0x00003400
+/* Same as R300_PACKET3_3D_DRAW_IMMD but without VAP_VTX_FMT */
+#define R300_PACKET3_3D_DRAW_IMMD_2         0x00003500
+/* Same as R300_PACKET3_3D_DRAW_INDX but without VAP_VTX_FMT */
+#define R300_PACKET3_3D_DRAW_INDX_2         0x00003600
+
+/* Clears a portion of hierachical Z RAM
+ * 3 dword parameters
+ * 0. START
+ * 1. COUNT: 13:0 (max is 0x3FFF)
+ * 2. CLEAR_VALUE: Value to write into HIZ RAM.
+ */
+#define R300_PACKET3_3D_CLEAR_HIZ           0x00003700
+
+/* Draws a set of primitives using vertex buffers pointed by the state data.
+ * At least 2 Parameters:
+ * 0. VAP_VF_CNTL: The first parameter is a standard primitive emission dword.
+ * 2 to end: Data or indices (see other 3D_DRAW_* packets for details)
+ */
+#define R300_PACKET3_3D_DRAW_128            0x00003900
+
+/* END: Packet 3 commands */
+
+
+/* Color formats for 2d packets
+ */
+#define R300_CP_COLOR_FORMAT_CI8       2
+#define R300_CP_COLOR_FORMAT_ARGB1555  3
+#define R300_CP_COLOR_FORMAT_RGB565    4
+#define R300_CP_COLOR_FORMAT_ARGB8888  6
+#define R300_CP_COLOR_FORMAT_RGB332    7
+#define R300_CP_COLOR_FORMAT_RGB8      9
+#define R300_CP_COLOR_FORMAT_ARGB4444  15
+
+/*
+ * CP type-3 packets
+ */
+#define R300_CP_CMD_BITBLT_MULTI       0xC0009B00
+
+#endif /* _R300_REG_H */
+
+/* *INDENT-ON* */
+
+/* vim: set foldenable foldmarker=\\{,\\} foldmethod=marker : */
diff --git a/src/mesa/drivers/dri/r600/r600_render.c b/src/mesa/drivers/dri/r600/r600_render.c
new file mode 100644 (file)
index 0000000..3d4b324
--- /dev/null
@@ -0,0 +1,548 @@
+/**************************************************************************
+
+Copyright (C) 2004 Nicolai Haehnle.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \brief R300 Render (Vertex Buffer Implementation)
+ *
+ * The immediate implementation has been removed from CVS in favor of the vertex
+ * buffer implementation.
+ *
+ * The render functions are called by the pipeline manager to render a batch of
+ * primitives. They return TRUE to pass on to the next stage (i.e. software
+ * rasterization) or FALSE to indicate that the pipeline has finished after
+ * rendering something.
+ *
+ * When falling back to software TCL still attempt to use hardware
+ * rasterization.
+ *
+ * I am not sure that the cache related registers are setup correctly, but
+ * obviously this does work... Further investigation is needed.
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ *
+ * \todo Add immediate implementation back? Perhaps this is useful if there are
+ * no bugs...
+ */
+
+#include "main/glheader.h"
+#include "main/state.h"
+#include "main/imports.h"
+#include "main/enums.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "main/dd.h"
+#include "main/simple_list.h"
+#include "main/api_arrayelt.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
+#include "radeon_reg.h"
+#include "radeon_macros.h"
+#include "r600_context.h"
+#include "r600_ioctl.h"
+#include "r600_state.h"
+#include "r600_reg.h"
+#include "r600_tex.h"
+#include "r600_emit.h"
+#include "r600_fragprog.h"
+extern int future_hw_tcl_on;
+
+/**
+ * \brief Convert a OpenGL primitive type into a R300 primitive type.
+ */
+int r300PrimitiveType(r300ContextPtr rmesa, int prim)
+{
+       switch (prim & PRIM_MODE_MASK) {
+       case GL_POINTS:
+               return R300_VAP_VF_CNTL__PRIM_POINTS;
+               break;
+       case GL_LINES:
+               return R300_VAP_VF_CNTL__PRIM_LINES;
+               break;
+       case GL_LINE_STRIP:
+               return R300_VAP_VF_CNTL__PRIM_LINE_STRIP;
+               break;
+       case GL_LINE_LOOP:
+               return R300_VAP_VF_CNTL__PRIM_LINE_LOOP;
+               break;
+       case GL_TRIANGLES:
+               return R300_VAP_VF_CNTL__PRIM_TRIANGLES;
+               break;
+       case GL_TRIANGLE_STRIP:
+               return R300_VAP_VF_CNTL__PRIM_TRIANGLE_STRIP;
+               break;
+       case GL_TRIANGLE_FAN:
+               return R300_VAP_VF_CNTL__PRIM_TRIANGLE_FAN;
+               break;
+       case GL_QUADS:
+               return R300_VAP_VF_CNTL__PRIM_QUADS;
+               break;
+       case GL_QUAD_STRIP:
+               return R300_VAP_VF_CNTL__PRIM_QUAD_STRIP;
+               break;
+       case GL_POLYGON:
+               return R300_VAP_VF_CNTL__PRIM_POLYGON;
+               break;
+       default:
+               assert(0);
+               return -1;
+               break;
+       }
+}
+
+int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim)
+{
+       int verts_off = 0;
+
+       switch (prim & PRIM_MODE_MASK) {
+       case GL_POINTS:
+               verts_off = 0;
+               break;
+       case GL_LINES:
+               verts_off = num_verts % 2;
+               break;
+       case GL_LINE_STRIP:
+               if (num_verts < 2)
+                       verts_off = num_verts;
+               break;
+       case GL_LINE_LOOP:
+               if (num_verts < 2)
+                       verts_off = num_verts;
+               break;
+       case GL_TRIANGLES:
+               verts_off = num_verts % 3;
+               break;
+       case GL_TRIANGLE_STRIP:
+               if (num_verts < 3)
+                       verts_off = num_verts;
+               break;
+       case GL_TRIANGLE_FAN:
+               if (num_verts < 3)
+                       verts_off = num_verts;
+               break;
+       case GL_QUADS:
+               verts_off = num_verts % 4;
+               break;
+       case GL_QUAD_STRIP:
+               if (num_verts < 4)
+                       verts_off = num_verts;
+               else
+                       verts_off = num_verts % 2;
+               break;
+       case GL_POLYGON:
+               if (num_verts < 3)
+                       verts_off = num_verts;
+               break;
+       default:
+               assert(0);
+               return -1;
+               break;
+       }
+
+       return num_verts - verts_off;
+}
+
+static void r300EmitElts(GLcontext * ctx, void *elts, unsigned long n_elts)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       void *out;
+
+       radeonAllocDmaRegion(&rmesa->radeon, &rmesa->radeon.tcl.elt_dma_bo,
+                            &rmesa->radeon.tcl.elt_dma_offset, n_elts * 4, 4);
+       radeon_bo_map(rmesa->radeon.tcl.elt_dma_bo, 1);
+       out = rmesa->radeon.tcl.elt_dma_bo->ptr + rmesa->radeon.tcl.elt_dma_offset;
+       memcpy(out, elts, n_elts * 4);
+       radeon_bo_unmap(rmesa->radeon.tcl.elt_dma_bo);
+}
+
+static void r300FireEB(r300ContextPtr rmesa, int vertex_count, int type)
+{
+       BATCH_LOCALS(&rmesa->radeon);
+
+       if (vertex_count > 0) {
+               BEGIN_BATCH(10);
+               OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0);
+               OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+                         ((vertex_count + 0) << 16) |
+                         type |
+                         R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
+               
+               if (!rmesa->radeon.radeonScreen->kernel_mm) {
+                       OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
+                       OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
+                                (R300_VAP_PORT_IDX0 >> 2));
+                       OUT_BATCH_RELOC(rmesa->radeon.tcl.elt_dma_offset,
+                                       rmesa->radeon.tcl.elt_dma_bo,
+                                       rmesa->radeon.tcl.elt_dma_offset,
+                                       RADEON_GEM_DOMAIN_GTT, 0, 0);
+                       OUT_BATCH(vertex_count);
+               } else {
+                       OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
+                       OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
+                                (R300_VAP_PORT_IDX0 >> 2));
+                       OUT_BATCH(rmesa->radeon.tcl.elt_dma_offset);
+                       OUT_BATCH(vertex_count);
+                       radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+                                             rmesa->radeon.tcl.elt_dma_bo,
+                                             RADEON_GEM_DOMAIN_GTT, 0, 0);
+               }
+               END_BATCH();
+       }
+}
+
+static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
+{
+       BATCH_LOCALS(&rmesa->radeon);
+       uint32_t voffset;
+       int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
+       int i;
+       
+       if (RADEON_DEBUG & DEBUG_VERTS)
+               fprintf(stderr, "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr,
+                       offset);
+
+    
+       if (!rmesa->radeon.radeonScreen->kernel_mm) {
+               BEGIN_BATCH(sz+2+(nr * 2));
+               OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+               OUT_BATCH(nr);
+
+               for (i = 0; i + 1 < nr; i += 2) {
+                       OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+                                 (rmesa->radeon.tcl.aos[i].stride << 8) |
+                                 (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+                                 (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+                       
+                       voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+                               offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+                       OUT_BATCH_RELOC(voffset,
+                                       rmesa->radeon.tcl.aos[i].bo,
+                                       voffset,
+                                       RADEON_GEM_DOMAIN_GTT,
+                                       0, 0);
+                       voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+                         offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+                       OUT_BATCH_RELOC(voffset,
+                                       rmesa->radeon.tcl.aos[i+1].bo,
+                                       voffset,
+                                       RADEON_GEM_DOMAIN_GTT,
+                                       0, 0);
+               }
+               
+               if (nr & 1) {
+                       OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+                                 (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+                       voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+                               offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+                       OUT_BATCH_RELOC(voffset,
+                                       rmesa->radeon.tcl.aos[nr - 1].bo,
+                                       voffset,
+                                       RADEON_GEM_DOMAIN_GTT,
+                                       0, 0);
+               }
+               END_BATCH();
+       } else {
+
+               BEGIN_BATCH(sz+2+(nr * 2));
+               OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+               OUT_BATCH(nr);
+
+               for (i = 0; i + 1 < nr; i += 2) {
+                       OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+                                 (rmesa->radeon.tcl.aos[i].stride << 8) |
+                                 (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+                                 (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+                       
+                       voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+                               offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+                       OUT_BATCH(voffset);
+                       voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+                               offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+                       OUT_BATCH(voffset);
+               }
+               
+               if (nr & 1) {
+                       OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+                         (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+                       voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+                               offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+                       OUT_BATCH(voffset);
+               }
+               for (i = 0; i + 1 < nr; i += 2) {
+                       voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+                               offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+                       radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+                                             rmesa->radeon.tcl.aos[i+0].bo,
+                                             RADEON_GEM_DOMAIN_GTT,
+                                             0, 0);
+                       voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+                               offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+                       radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+                                             rmesa->radeon.tcl.aos[i+1].bo,
+                                             RADEON_GEM_DOMAIN_GTT,
+                                             0, 0);
+               }
+               if (nr & 1) {
+                       voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+                               offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+                       radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+                                             rmesa->radeon.tcl.aos[nr-1].bo,
+                                             RADEON_GEM_DOMAIN_GTT,
+                                             0, 0);
+               }
+               END_BATCH();
+       }
+
+}
+
+static void r300FireAOS(r300ContextPtr rmesa, int vertex_count, int type)
+{
+       BATCH_LOCALS(&rmesa->radeon);
+
+       BEGIN_BATCH(3);
+       OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+       OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type);
+       END_BATCH();
+}
+
+static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
+                                  int start, int end, int prim)
+{
+       int type, num_verts;
+       TNLcontext *tnl = TNL_CONTEXT(ctx);
+       struct vertex_buffer *vb = &tnl->vb;
+
+       type = r300PrimitiveType(rmesa, prim);
+       num_verts = r300NumVerts(rmesa, end - start, prim);
+
+       if (type < 0 || num_verts <= 0)
+               return;
+
+       /* Make space for at least 64 dwords.
+        * This is supposed to ensure that we can get all rendering
+        * commands into a single command buffer.
+        */
+       rcommonEnsureCmdBufSpace(&rmesa->radeon, 64, __FUNCTION__);
+
+       if (vb->Elts) {
+               if (num_verts > 65535) {
+                       /* not implemented yet */
+                       WARN_ONCE("Too many elts\n");
+                       return;
+               }
+               /* Note: The following is incorrect, but it's the best I can do
+                * without a major refactoring of how DMA memory is handled.
+                * The problem: Ensuring that both vertex arrays *and* index
+                * arrays are at the right position, and then ensuring that
+                * the LOAD_VBPNTR, DRAW_INDX and INDX_BUFFER packets are emitted
+                * at once.
+                *
+                * So why is the following incorrect? Well, it seems like
+                * allocating the index array might actually evict the vertex
+                * arrays. *sigh*
+                */
+               r300EmitElts(ctx, vb->Elts, num_verts);
+               r300EmitAOS(rmesa, rmesa->radeon.tcl.aos_count, start);
+               r300FireEB(rmesa, num_verts, type);
+       } else {
+               r300EmitAOS(rmesa, rmesa->radeon.tcl.aos_count, start);
+               r300FireAOS(rmesa, num_verts, type);
+       }
+       COMMIT_BATCH();
+}
+
+static GLboolean r300RunRender(GLcontext * ctx,
+                              struct tnl_pipeline_stage *stage)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       int i;
+       TNLcontext *tnl = TNL_CONTEXT(ctx);
+       struct vertex_buffer *vb = &tnl->vb;
+
+       if (RADEON_DEBUG & DEBUG_PRIMS)
+               fprintf(stderr, "%s\n", __FUNCTION__);
+
+       r300UpdateShaders(rmesa);
+       if (r300EmitArrays(ctx))
+               return GL_TRUE;
+
+       r300UpdateShaderStates(rmesa);
+
+       r300EmitCacheFlush(rmesa);
+       radeonEmitState(&rmesa->radeon);
+
+       for (i = 0; i < vb->PrimitiveCount; i++) {
+               GLuint prim = _tnl_translate_prim(&vb->Primitive[i]);
+               GLuint start = vb->Primitive[i].start;
+               GLuint end = vb->Primitive[i].start + vb->Primitive[i].count;
+               r300RunRenderPrimitive(rmesa, ctx, start, end, prim);
+       }
+
+       r300EmitCacheFlush(rmesa);
+
+       radeonReleaseArrays(ctx, ~0);
+
+       return GL_FALSE;
+}
+
+#define FALLBACK_IF(expr)                                              \
+       do {                                                            \
+               if (expr) {                                             \
+                       if (1 || RADEON_DEBUG & DEBUG_FALLBACKS)        \
+                               WARN_ONCE("Software fallback:%s\n",     \
+                                         #expr);                       \
+                       return R300_FALLBACK_RAST;                      \
+               }                                                       \
+       } while(0)
+
+static int r300Fallback(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       const unsigned back = ctx->Stencil._BackFace;
+       
+       FALLBACK_IF(r300->radeon.Fallback);
+       /* Do we need to use new-style shaders?
+        * Also is there a better way to do this? */
+       if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+               struct r500_fragment_program *fp = (struct r500_fragment_program *)
+           (char *)ctx->FragmentProgram._Current;
+               if (fp) {
+                       if (!fp->translated) {
+                               r500TranslateFragmentShader(r300, fp);
+                               FALLBACK_IF(!fp->translated);
+                       }
+               }
+       } else {
+               struct r300_fragment_program *fp = (struct r300_fragment_program *)
+           (char *)ctx->FragmentProgram._Current;
+               if (fp) {
+                       if (!fp->translated) {
+                               r300TranslateFragmentShader(r300, fp);
+                               FALLBACK_IF(!fp->translated);
+                       }
+               }
+       }
+
+       FALLBACK_IF(ctx->RenderMode != GL_RENDER);
+
+       /* If GL_EXT_stencil_two_side is disabled, this fallback check can
+        * be removed.
+        */
+       FALLBACK_IF(ctx->Stencil.Ref[0] != ctx->Stencil.Ref[back]
+                   || ctx->Stencil.ValueMask[0] !=
+                   ctx->Stencil.ValueMask[back]
+                   || ctx->Stencil.WriteMask[0] !=
+                   ctx->Stencil.WriteMask[back]);
+
+       if (ctx->Extensions.NV_point_sprite || ctx->Extensions.ARB_point_sprite)
+               FALLBACK_IF(ctx->Point.PointSprite);
+
+       if (!r300->disable_lowimpact_fallback) {
+               FALLBACK_IF(ctx->Polygon.StippleFlag);
+               FALLBACK_IF(ctx->Multisample._Enabled);
+               FALLBACK_IF(ctx->Line.StippleFlag);
+               FALLBACK_IF(ctx->Line.SmoothFlag);
+               FALLBACK_IF(ctx->Point.SmoothFlag);
+       }
+
+       return R300_FALLBACK_NONE;
+}
+
+static GLboolean r300RunNonTCLRender(GLcontext * ctx,
+                                    struct tnl_pipeline_stage *stage)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+       if (RADEON_DEBUG & DEBUG_PRIMS)
+               fprintf(stderr, "%s\n", __FUNCTION__);
+
+       if (r300Fallback(ctx) >= R300_FALLBACK_RAST)
+               return GL_TRUE;
+
+       if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+               return GL_TRUE;
+
+       if (!r300ValidateBuffers(ctx))
+           return GL_TRUE;
+       
+       return r300RunRender(ctx, stage);
+}
+
+static GLboolean r300RunTCLRender(GLcontext * ctx,
+                                 struct tnl_pipeline_stage *stage)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       struct r300_vertex_program *vp;
+
+       hw_tcl_on = future_hw_tcl_on;
+
+       if (RADEON_DEBUG & DEBUG_PRIMS)
+               fprintf(stderr, "%s\n", __FUNCTION__);
+
+       if (hw_tcl_on == GL_FALSE)
+               return GL_TRUE;
+
+       if (r300Fallback(ctx) >= R300_FALLBACK_TCL) {
+               hw_tcl_on = GL_FALSE;
+               return GL_TRUE;
+       }
+
+       if (!r300ValidateBuffers(ctx))
+           return GL_TRUE;
+       
+       r300UpdateShaders(rmesa);
+
+       vp = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
+       if (vp->native == GL_FALSE) {
+               hw_tcl_on = GL_FALSE;
+               return GL_TRUE;
+       }
+
+       return r300RunRender(ctx, stage);
+}
+
+const struct tnl_pipeline_stage _r300_render_stage = {
+       "r300 Hardware Rasterization",
+       NULL,
+       NULL,
+       NULL,
+       NULL,
+       r300RunNonTCLRender
+};
+
+const struct tnl_pipeline_stage _r300_tcl_stage = {
+       "r300 Hardware Transform, Clipping and Lighting",
+       NULL,
+       NULL,
+       NULL,
+       NULL,
+       r300RunTCLRender
+};
diff --git a/src/mesa/drivers/dri/r600/r600_shader.c b/src/mesa/drivers/dri/r600/r600_shader.c
new file mode 100644 (file)
index 0000000..63d4b45
--- /dev/null
@@ -0,0 +1,93 @@
+
+#include "main/glheader.h"
+
+#include "shader/program.h"
+#include "tnl/tnl.h"
+#include "r600_context.h"
+#include "r600_fragprog.h"
+
+static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
+                                        GLuint id)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       struct r300_vertex_program_cont *vp;
+       struct r300_fragment_program *r300_fp;
+       struct r500_fragment_program *r500_fp;
+
+       switch (target) {
+       case GL_VERTEX_STATE_PROGRAM_NV:
+       case GL_VERTEX_PROGRAM_ARB:
+               vp = CALLOC_STRUCT(r300_vertex_program_cont);
+               return _mesa_init_vertex_program(ctx, &vp->mesa_program,
+                                                target, id);
+       case GL_FRAGMENT_PROGRAM_ARB:
+               if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+                       r500_fp = CALLOC_STRUCT(r500_fragment_program);
+                       r500_fp->ctx = ctx;
+                       return _mesa_init_fragment_program(ctx, &r500_fp->mesa_program,
+                                                          target, id);
+               } else {
+                       r300_fp = CALLOC_STRUCT(r300_fragment_program);
+                       return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
+                                                          target, id);
+               }
+
+       case GL_FRAGMENT_PROGRAM_NV:
+               if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+                       r500_fp = CALLOC_STRUCT(r500_fragment_program);
+                       return _mesa_init_fragment_program(ctx, &r500_fp->mesa_program,
+                                                          target, id);
+               } else {
+                       r300_fp = CALLOC_STRUCT(r300_fragment_program);
+                       return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
+                                                          target, id);
+               }
+       default:
+               _mesa_problem(ctx, "Bad target in r300NewProgram");
+       }
+
+       return NULL;
+}
+
+static void r300DeleteProgram(GLcontext * ctx, struct gl_program *prog)
+{
+       _mesa_delete_program(ctx, prog);
+}
+
+static void
+r300ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       struct r300_vertex_program_cont *vp = (void *)prog;
+       struct r300_fragment_program *r300_fp = (struct r300_fragment_program *)prog;
+       struct r500_fragment_program *r500_fp = (struct r500_fragment_program *)prog;
+
+       switch (target) {
+       case GL_VERTEX_PROGRAM_ARB:
+               vp->progs = NULL;
+               break;
+       case GL_FRAGMENT_PROGRAM_ARB:
+               if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+                       r500_fp->translated = GL_FALSE;
+               else
+                       r300_fp->translated = GL_FALSE;
+               break;
+       }
+
+       /* need this for tcl fallbacks */
+       _tnl_program_string(ctx, target, prog);
+}
+
+static GLboolean
+r300IsProgramNative(GLcontext * ctx, GLenum target, struct gl_program *prog)
+{
+       return GL_TRUE;
+}
+
+void r300InitShaderFuncs(struct dd_function_table *functions)
+{
+       functions->NewProgram = r300NewProgram;
+       functions->DeleteProgram = r300DeleteProgram;
+       functions->ProgramStringNotify = r300ProgramStringNotify;
+       functions->IsProgramNative = r300IsProgramNative;
+}
diff --git a/src/mesa/drivers/dri/r600/r600_state.c b/src/mesa/drivers/dri/r600/r600_state.c
new file mode 100644 (file)
index 0000000..8cb9c7c
--- /dev/null
@@ -0,0 +1,2600 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.
+Copyright (C) 2004 Nicolai Haehnle.
+All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#include "main/glheader.h"
+#include "main/state.h"
+#include "main/imports.h"
+#include "main/enums.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "main/dd.h"
+#include "main/framebuffer.h"
+#include "main/simple_list.h"
+#include "main/api_arrayelt.h"
+#include "main/texformat.h"
+
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+
+#include "r600_context.h"
+#include "r600_ioctl.h"
+#include "r600_state.h"
+#include "r600_reg.h"
+#include "r600_emit.h"
+#include "r600_fragprog.h"
+#include "r600_tex.h"
+
+#include "drirenderbuffer.h"
+
+extern int future_hw_tcl_on;
+extern void _tnl_UpdateFixedFunctionProgram(GLcontext * ctx);
+
+static void r300BlendColor(GLcontext * ctx, const GLfloat cf[4])
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+       R300_STATECHANGE(rmesa, blend_color);
+
+       if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+               GLuint r = IROUND(cf[0]*1023.0f);
+               GLuint g = IROUND(cf[1]*1023.0f);
+               GLuint b = IROUND(cf[2]*1023.0f);
+               GLuint a = IROUND(cf[3]*1023.0f);
+
+               rmesa->hw.blend_color.cmd[1] = r | (a << 16);
+               rmesa->hw.blend_color.cmd[2] = b | (g << 16);
+       } else {
+               GLubyte color[4];
+               CLAMPED_FLOAT_TO_UBYTE(color[0], cf[0]);
+               CLAMPED_FLOAT_TO_UBYTE(color[1], cf[1]);
+               CLAMPED_FLOAT_TO_UBYTE(color[2], cf[2]);
+               CLAMPED_FLOAT_TO_UBYTE(color[3], cf[3]);
+
+               rmesa->hw.blend_color.cmd[1] = PACK_COLOR_8888(color[3], color[0],
+                                                       color[1], color[2]);
+       }
+}
+
+/**
+ * Calculate the hardware blend factor setting.  This same function is used
+ * for source and destination of both alpha and RGB.
+ *
+ * \returns
+ * The hardware register value for the specified blend factor.  This value
+ * will need to be shifted into the correct position for either source or
+ * destination factor.
+ *
+ * \todo
+ * Since the two cases where source and destination are handled differently
+ * are essentially error cases, they should never happen.  Determine if these
+ * cases can be removed.
+ */
+static int blend_factor(GLenum factor, GLboolean is_src)
+{
+       switch (factor) {
+       case GL_ZERO:
+               return R300_BLEND_GL_ZERO;
+               break;
+       case GL_ONE:
+               return R300_BLEND_GL_ONE;
+               break;
+       case GL_DST_COLOR:
+               return R300_BLEND_GL_DST_COLOR;
+               break;
+       case GL_ONE_MINUS_DST_COLOR:
+               return R300_BLEND_GL_ONE_MINUS_DST_COLOR;
+               break;
+       case GL_SRC_COLOR:
+               return R300_BLEND_GL_SRC_COLOR;
+               break;
+       case GL_ONE_MINUS_SRC_COLOR:
+               return R300_BLEND_GL_ONE_MINUS_SRC_COLOR;
+               break;
+       case GL_SRC_ALPHA:
+               return R300_BLEND_GL_SRC_ALPHA;
+               break;
+       case GL_ONE_MINUS_SRC_ALPHA:
+               return R300_BLEND_GL_ONE_MINUS_SRC_ALPHA;
+               break;
+       case GL_DST_ALPHA:
+               return R300_BLEND_GL_DST_ALPHA;
+               break;
+       case GL_ONE_MINUS_DST_ALPHA:
+               return R300_BLEND_GL_ONE_MINUS_DST_ALPHA;
+               break;
+       case GL_SRC_ALPHA_SATURATE:
+               return (is_src) ? R300_BLEND_GL_SRC_ALPHA_SATURATE :
+                   R300_BLEND_GL_ZERO;
+               break;
+       case GL_CONSTANT_COLOR:
+               return R300_BLEND_GL_CONST_COLOR;
+               break;
+       case GL_ONE_MINUS_CONSTANT_COLOR:
+               return R300_BLEND_GL_ONE_MINUS_CONST_COLOR;
+               break;
+       case GL_CONSTANT_ALPHA:
+               return R300_BLEND_GL_CONST_ALPHA;
+               break;
+       case GL_ONE_MINUS_CONSTANT_ALPHA:
+               return R300_BLEND_GL_ONE_MINUS_CONST_ALPHA;
+               break;
+       default:
+               fprintf(stderr, "unknown blend factor %x\n", factor);
+               return (is_src) ? R300_BLEND_GL_ONE : R300_BLEND_GL_ZERO;
+               break;
+       }
+}
+
+/**
+ * Sets both the blend equation and the blend function.
+ * This is done in a single
+ * function because some blend equations (i.e., \c GL_MIN and \c GL_MAX)
+ * change the interpretation of the blend function.
+ * Also, make sure that blend function and blend equation are set to their
+ * default value if color blending is not enabled, since at least blend
+ * equations GL_MIN and GL_FUNC_REVERSE_SUBTRACT will cause wrong results
+ * otherwise for unknown reasons.
+ */
+
+/* helper function */
+static void r300SetBlendCntl(r300ContextPtr r300, int func, int eqn,
+                            int cbits, int funcA, int eqnA)
+{
+       GLuint new_ablend, new_cblend;
+
+#if 0
+       fprintf(stderr,
+               "eqnA=%08x funcA=%08x eqn=%08x func=%08x cbits=%08x\n",
+               eqnA, funcA, eqn, func, cbits);
+#endif
+       new_ablend = eqnA | funcA;
+       new_cblend = eqn | func;
+
+       /* Some blend factor combinations don't seem to work when the
+        * BLEND_NO_SEPARATE bit is set.
+        *
+        * Especially problematic candidates are the ONE_MINUS_* flags,
+        * but I can't see a real pattern.
+        */
+#if 0
+       if (new_ablend == new_cblend) {
+               new_cblend |= R300_DISCARD_SRC_PIXELS_SRC_ALPHA_0;
+       }
+#endif
+       new_cblend |= cbits;
+
+       if ((new_ablend != r300->hw.bld.cmd[R300_BLD_ABLEND]) ||
+           (new_cblend != r300->hw.bld.cmd[R300_BLD_CBLEND])) {
+               R300_STATECHANGE(r300, bld);
+               r300->hw.bld.cmd[R300_BLD_ABLEND] = new_ablend;
+               r300->hw.bld.cmd[R300_BLD_CBLEND] = new_cblend;
+       }
+}
+
+static void r300SetBlendState(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       int func = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+           (R300_BLEND_GL_ZERO << R300_DST_BLEND_SHIFT);
+       int eqn = R300_COMB_FCN_ADD_CLAMP;
+       int funcA = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+           (R300_BLEND_GL_ZERO << R300_DST_BLEND_SHIFT);
+       int eqnA = R300_COMB_FCN_ADD_CLAMP;
+
+       if (RGBA_LOGICOP_ENABLED(ctx) || !ctx->Color.BlendEnabled) {
+               r300SetBlendCntl(r300, func, eqn, 0, func, eqn);
+               return;
+       }
+
+       func =
+           (blend_factor(ctx->Color.BlendSrcRGB, GL_TRUE) <<
+            R300_SRC_BLEND_SHIFT) | (blend_factor(ctx->Color.BlendDstRGB,
+                                                  GL_FALSE) <<
+                                     R300_DST_BLEND_SHIFT);
+
+       switch (ctx->Color.BlendEquationRGB) {
+       case GL_FUNC_ADD:
+               eqn = R300_COMB_FCN_ADD_CLAMP;
+               break;
+
+       case GL_FUNC_SUBTRACT:
+               eqn = R300_COMB_FCN_SUB_CLAMP;
+               break;
+
+       case GL_FUNC_REVERSE_SUBTRACT:
+               eqn = R300_COMB_FCN_RSUB_CLAMP;
+               break;
+
+       case GL_MIN:
+               eqn = R300_COMB_FCN_MIN;
+               func = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+                   (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+               break;
+
+       case GL_MAX:
+               eqn = R300_COMB_FCN_MAX;
+               func = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+                   (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+               break;
+
+       default:
+               fprintf(stderr,
+                       "[%s:%u] Invalid RGB blend equation (0x%04x).\n",
+                       __FUNCTION__, __LINE__, ctx->Color.BlendEquationRGB);
+               return;
+       }
+
+       funcA =
+           (blend_factor(ctx->Color.BlendSrcA, GL_TRUE) <<
+            R300_SRC_BLEND_SHIFT) | (blend_factor(ctx->Color.BlendDstA,
+                                                  GL_FALSE) <<
+                                     R300_DST_BLEND_SHIFT);
+
+       switch (ctx->Color.BlendEquationA) {
+       case GL_FUNC_ADD:
+               eqnA = R300_COMB_FCN_ADD_CLAMP;
+               break;
+
+       case GL_FUNC_SUBTRACT:
+               eqnA = R300_COMB_FCN_SUB_CLAMP;
+               break;
+
+       case GL_FUNC_REVERSE_SUBTRACT:
+               eqnA = R300_COMB_FCN_RSUB_CLAMP;
+               break;
+
+       case GL_MIN:
+               eqnA = R300_COMB_FCN_MIN;
+               funcA = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+                   (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+               break;
+
+       case GL_MAX:
+               eqnA = R300_COMB_FCN_MAX;
+               funcA = (R300_BLEND_GL_ONE << R300_SRC_BLEND_SHIFT) |
+                   (R300_BLEND_GL_ONE << R300_DST_BLEND_SHIFT);
+               break;
+
+       default:
+               fprintf(stderr,
+                       "[%s:%u] Invalid A blend equation (0x%04x).\n",
+                       __FUNCTION__, __LINE__, ctx->Color.BlendEquationA);
+               return;
+       }
+
+       r300SetBlendCntl(r300,
+                        func, eqn,
+                        (R300_SEPARATE_ALPHA_ENABLE |
+                         R300_READ_ENABLE |
+                         R300_ALPHA_BLEND_ENABLE), funcA, eqnA);
+}
+
+static void r300BlendEquationSeparate(GLcontext * ctx,
+                                     GLenum modeRGB, GLenum modeA)
+{
+       r300SetBlendState(ctx);
+}
+
+static void r300BlendFuncSeparate(GLcontext * ctx,
+                                 GLenum sfactorRGB, GLenum dfactorRGB,
+                                 GLenum sfactorA, GLenum dfactorA)
+{
+       r300SetBlendState(ctx);
+}
+
+/**
+ * Translate LogicOp enums into hardware representation.
+ * Both use a very logical bit-wise layout, but unfortunately the order
+ * of bits is reversed.
+ */
+static GLuint translate_logicop(GLenum logicop)
+{
+       GLuint bits = logicop - GL_CLEAR;
+       bits = ((bits & 1) << 3) | ((bits & 2) << 1) | ((bits & 4) >> 1) | ((bits & 8) >> 3);
+       return bits << R300_RB3D_ROPCNTL_ROP_SHIFT;
+}
+
+/**
+ * Used internally to update the r300->hw hardware state to match the
+ * current OpenGL state.
+ */
+static void r300SetLogicOpState(GLcontext *ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       R300_STATECHANGE(r300, rop);
+       if (RGBA_LOGICOP_ENABLED(ctx)) {
+               r300->hw.rop.cmd[1] = R300_RB3D_ROPCNTL_ROP_ENABLE |
+                       translate_logicop(ctx->Color.LogicOp);
+       } else {
+               r300->hw.rop.cmd[1] = 0;
+       }
+}
+
+/**
+ * Called by Mesa when an application program changes the LogicOp state
+ * via glLogicOp.
+ */
+static void r300LogicOpcode(GLcontext *ctx, GLenum logicop)
+{
+       if (RGBA_LOGICOP_ENABLED(ctx))
+               r300SetLogicOpState(ctx);
+}
+
+static void r300ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       GLint p;
+       GLint *ip;
+
+       /* no VAP UCP on non-TCL chipsets */
+       if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+                       return;
+
+       p = (GLint) plane - (GLint) GL_CLIP_PLANE0;
+       ip = (GLint *)ctx->Transform._ClipUserPlane[p];
+
+       R300_STATECHANGE( rmesa, vpucp[p] );
+       rmesa->hw.vpucp[p].cmd[R300_VPUCP_X] = ip[0];
+       rmesa->hw.vpucp[p].cmd[R300_VPUCP_Y] = ip[1];
+       rmesa->hw.vpucp[p].cmd[R300_VPUCP_Z] = ip[2];
+       rmesa->hw.vpucp[p].cmd[R300_VPUCP_W] = ip[3];
+}
+
+static void r300SetClipPlaneState(GLcontext * ctx, GLenum cap, GLboolean state)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       GLuint p;
+
+       /* no VAP UCP on non-TCL chipsets */
+       if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+               return;
+
+       p = cap - GL_CLIP_PLANE0;
+       R300_STATECHANGE(r300, vap_clip_cntl);
+       if (state) {
+               r300->hw.vap_clip_cntl.cmd[1] |= (R300_VAP_UCP_ENABLE_0 << p);
+               r300ClipPlane(ctx, cap, NULL);
+       } else {
+               r300->hw.vap_clip_cntl.cmd[1] &= ~(R300_VAP_UCP_ENABLE_0 << p);
+       }
+}
+
+/**
+ * Update our tracked culling state based on Mesa's state.
+ */
+static void r300UpdateCulling(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       uint32_t val = 0;
+
+       if (ctx->Polygon.CullFlag) {
+               switch (ctx->Polygon.CullFaceMode) {
+               case GL_FRONT:
+                       val = R300_CULL_FRONT;
+                       break;
+               case GL_BACK:
+                       val = R300_CULL_BACK;
+                       break;
+               case GL_FRONT_AND_BACK:
+                       val = R300_CULL_FRONT | R300_CULL_BACK;
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       switch (ctx->Polygon.FrontFace) {
+       case GL_CW:
+               val |= R300_FRONT_FACE_CW;
+               break;
+       case GL_CCW:
+               val |= R300_FRONT_FACE_CCW;
+               break;
+       default:
+               break;
+       }
+
+       R300_STATECHANGE(r300, cul);
+       r300->hw.cul.cmd[R300_CUL_CULL] = val;
+}
+
+static void r300SetPolygonOffsetState(GLcontext * ctx, GLboolean state)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+       R300_STATECHANGE(r300, occlusion_cntl);
+       if (state) {
+               r300->hw.occlusion_cntl.cmd[1] |= (3 << 0);
+       } else {
+               r300->hw.occlusion_cntl.cmd[1] &= ~(3 << 0);
+       }
+}
+
+static GLboolean current_fragment_program_writes_depth(GLcontext* ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+       if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
+               struct r300_fragment_program *fp = (struct r300_fragment_program *)
+                       (char *)ctx->FragmentProgram._Current;
+               return (fp && fp->WritesDepth);
+       } else {
+               struct r500_fragment_program* fp =
+                       (struct r500_fragment_program*)(char*)
+                       ctx->FragmentProgram._Current;
+               return (fp && fp->writes_depth);
+       }
+}
+
+static void r300SetEarlyZState(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       GLuint topZ = R300_ZTOP_ENABLE;
+
+       if (ctx->Color.AlphaEnabled && ctx->Color.AlphaFunc != GL_ALWAYS)
+               topZ = R300_ZTOP_DISABLE;
+       if (current_fragment_program_writes_depth(ctx))
+               topZ = R300_ZTOP_DISABLE;
+
+       if (topZ != r300->hw.zstencil_format.cmd[2]) {
+               /* Note: This completely reemits the stencil format.
+                * I have not tested whether this is strictly necessary,
+                * or if emitting a write to ZB_ZTOP is enough.
+                */
+               R300_STATECHANGE(r300, zstencil_format);
+               r300->hw.zstencil_format.cmd[2] = topZ;
+       }
+}
+
+static void r300SetAlphaState(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       GLubyte refByte;
+       uint32_t pp_misc = 0x0;
+       GLboolean really_enabled = ctx->Color.AlphaEnabled;
+
+       CLAMPED_FLOAT_TO_UBYTE(refByte, ctx->Color.AlphaRef);
+
+       switch (ctx->Color.AlphaFunc) {
+       case GL_NEVER:
+               pp_misc |= R300_FG_ALPHA_FUNC_NEVER;
+               break;
+       case GL_LESS:
+               pp_misc |= R300_FG_ALPHA_FUNC_LESS;
+               break;
+       case GL_EQUAL:
+               pp_misc |= R300_FG_ALPHA_FUNC_EQUAL;
+               break;
+       case GL_LEQUAL:
+               pp_misc |= R300_FG_ALPHA_FUNC_LE;
+               break;
+       case GL_GREATER:
+               pp_misc |= R300_FG_ALPHA_FUNC_GREATER;
+               break;
+       case GL_NOTEQUAL:
+               pp_misc |= R300_FG_ALPHA_FUNC_NOTEQUAL;
+               break;
+       case GL_GEQUAL:
+               pp_misc |= R300_FG_ALPHA_FUNC_GE;
+               break;
+       case GL_ALWAYS:
+               /*pp_misc |= FG_ALPHA_FUNC_ALWAYS; */
+               really_enabled = GL_FALSE;
+               break;
+       }
+
+       if (really_enabled) {
+               pp_misc |= R300_FG_ALPHA_FUNC_ENABLE;
+               pp_misc |= R500_FG_ALPHA_FUNC_8BIT;
+               pp_misc |= (refByte & R300_FG_ALPHA_FUNC_VAL_MASK);
+       } else {
+               pp_misc = 0x0;
+       }
+
+       R300_STATECHANGE(r300, at);
+       r300->hw.at.cmd[R300_AT_ALPHA_TEST] = pp_misc;
+       r300->hw.at.cmd[R300_AT_UNKNOWN] = 0;
+
+       r300SetEarlyZState(ctx);
+}
+
+static void r300AlphaFunc(GLcontext * ctx, GLenum func, GLfloat ref)
+{
+       (void)func;
+       (void)ref;
+       r300SetAlphaState(ctx);
+}
+
+static int translate_func(int func)
+{
+       switch (func) {
+       case GL_NEVER:
+               return R300_ZS_NEVER;
+       case GL_LESS:
+               return R300_ZS_LESS;
+       case GL_EQUAL:
+               return R300_ZS_EQUAL;
+       case GL_LEQUAL:
+               return R300_ZS_LEQUAL;
+       case GL_GREATER:
+               return R300_ZS_GREATER;
+       case GL_NOTEQUAL:
+               return R300_ZS_NOTEQUAL;
+       case GL_GEQUAL:
+               return R300_ZS_GEQUAL;
+       case GL_ALWAYS:
+               return R300_ZS_ALWAYS;
+       }
+       return 0;
+}
+
+static void r300SetDepthState(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+       R300_STATECHANGE(r300, zs);
+       r300->hw.zs.cmd[R300_ZS_CNTL_0] &= R300_STENCIL_ENABLE|R300_STENCIL_FRONT_BACK;
+       r300->hw.zs.cmd[R300_ZS_CNTL_1] &= ~(R300_ZS_MASK << R300_Z_FUNC_SHIFT);
+
+       if (ctx->Depth.Test) {
+               r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_ENABLE;
+               if (ctx->Depth.Mask)
+                       r300->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_Z_WRITE_ENABLE;
+               r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
+                   translate_func(ctx->Depth.Func) << R300_Z_FUNC_SHIFT;
+       }
+
+       r300SetEarlyZState(ctx);
+}
+
+static void r300SetStencilState(GLcontext * ctx, GLboolean state)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       GLboolean hw_stencil = GL_FALSE;
+       if (ctx->DrawBuffer) {
+               struct radeon_renderbuffer *rrbStencil
+                       = radeon_get_renderbuffer(ctx->DrawBuffer, BUFFER_STENCIL);
+               hw_stencil = (rrbStencil && rrbStencil->bo);
+       }
+
+       if (hw_stencil) {
+               R300_STATECHANGE(r300, zs);
+               if (state) {
+                       r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
+                           R300_STENCIL_ENABLE;
+               } else {
+                       r300->hw.zs.cmd[R300_ZS_CNTL_0] &=
+                           ~R300_STENCIL_ENABLE;
+               }
+       } else {
+#if R200_MERGED
+               FALLBACK(&r300->radeon, RADEON_FALLBACK_STENCIL, state);
+#endif
+       }
+}
+
+static void r300UpdatePolygonMode(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       uint32_t hw_mode = R300_GA_POLY_MODE_DISABLE;
+
+       /* Only do something if a polygon mode is wanted, default is GL_FILL */
+       if (ctx->Polygon.FrontMode != GL_FILL ||
+           ctx->Polygon.BackMode != GL_FILL) {
+               GLenum f, b;
+
+               /* Handle GL_CW (clock wise and GL_CCW (counter clock wise)
+                * correctly by selecting the correct front and back face
+                */
+               if (ctx->Polygon.FrontFace == GL_CCW) {
+                       f = ctx->Polygon.FrontMode;
+                       b = ctx->Polygon.BackMode;
+               } else {
+                       f = ctx->Polygon.BackMode;
+                       b = ctx->Polygon.FrontMode;
+               }
+
+               /* Enable polygon mode */
+               hw_mode |= R300_GA_POLY_MODE_DUAL;
+
+               switch (f) {
+               case GL_LINE:
+                       hw_mode |= R300_GA_POLY_MODE_FRONT_PTYPE_LINE;
+                       break;
+               case GL_POINT:
+                       hw_mode |= R300_GA_POLY_MODE_FRONT_PTYPE_POINT;
+                       break;
+               case GL_FILL:
+                       hw_mode |= R300_GA_POLY_MODE_FRONT_PTYPE_TRI;
+                       break;
+               }
+
+               switch (b) {
+               case GL_LINE:
+                       hw_mode |= R300_GA_POLY_MODE_BACK_PTYPE_LINE;
+                       break;
+               case GL_POINT:
+                       hw_mode |= R300_GA_POLY_MODE_BACK_PTYPE_POINT;
+                       break;
+               case GL_FILL:
+                       hw_mode |= R300_GA_POLY_MODE_BACK_PTYPE_TRI;
+                       break;
+               }
+       }
+
+       if (r300->hw.polygon_mode.cmd[1] != hw_mode) {
+               R300_STATECHANGE(r300, polygon_mode);
+               r300->hw.polygon_mode.cmd[1] = hw_mode;
+       }
+
+       r300->hw.polygon_mode.cmd[2] = 0x00000001;
+       r300->hw.polygon_mode.cmd[3] = 0x00000000;
+}
+
+/**
+ * Change the culling mode.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300CullFace(GLcontext * ctx, GLenum mode)
+{
+       (void)mode;
+
+       r300UpdateCulling(ctx);
+}
+
+/**
+ * Change the polygon orientation.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300FrontFace(GLcontext * ctx, GLenum mode)
+{
+       (void)mode;
+
+       r300UpdateCulling(ctx);
+       r300UpdatePolygonMode(ctx);
+}
+
+/**
+ * Change the depth testing function.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300DepthFunc(GLcontext * ctx, GLenum func)
+{
+       (void)func;
+       r300SetDepthState(ctx);
+}
+
+/**
+ * Enable/Disable depth writing.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300DepthMask(GLcontext * ctx, GLboolean mask)
+{
+       (void)mask;
+       r300SetDepthState(ctx);
+}
+
+/**
+ * Handle glColorMask()
+ */
+static void r300ColorMask(GLcontext * ctx,
+                         GLboolean r, GLboolean g, GLboolean b, GLboolean a)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       int mask = (r ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
+           (g ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
+           (b ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
+           (a ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0);
+
+       if (mask != r300->hw.cmk.cmd[R300_CMK_COLORMASK]) {
+               R300_STATECHANGE(r300, cmk);
+               r300->hw.cmk.cmd[R300_CMK_COLORMASK] = mask;
+       }
+}
+
+/* =============================================================
+ * Point state
+ */
+static void r300PointSize(GLcontext * ctx, GLfloat size)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+        /* same size limits for AA, non-AA points */
+       size = CLAMP(size, ctx->Const.MinPointSize, ctx->Const.MaxPointSize);
+
+       R300_STATECHANGE(r300, ps);
+       r300->hw.ps.cmd[R300_PS_POINTSIZE] =
+           ((int)(size * 6) << R300_POINTSIZE_X_SHIFT) |
+           ((int)(size * 6) << R300_POINTSIZE_Y_SHIFT);
+}
+
+static void r300PointParameter(GLcontext * ctx, GLenum pname, const GLfloat * param)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+       switch (pname) {
+       case GL_POINT_SIZE_MIN:
+               R300_STATECHANGE(r300, ga_point_minmax);
+               r300->hw.ga_point_minmax.cmd[1] &= ~R300_GA_POINT_MINMAX_MIN_MASK;
+               r300->hw.ga_point_minmax.cmd[1] |= (GLuint)(ctx->Point.MinSize * 6.0);
+               break;
+       case GL_POINT_SIZE_MAX:
+               R300_STATECHANGE(r300, ga_point_minmax);
+               r300->hw.ga_point_minmax.cmd[1] &= ~R300_GA_POINT_MINMAX_MAX_MASK;
+               r300->hw.ga_point_minmax.cmd[1] |= (GLuint)(ctx->Point.MaxSize * 6.0)
+                       << R300_GA_POINT_MINMAX_MAX_SHIFT;
+               break;
+       case GL_POINT_DISTANCE_ATTENUATION:
+               break;
+       case GL_POINT_FADE_THRESHOLD_SIZE:
+               break;
+       default:
+               break;
+       }
+}
+
+/* =============================================================
+ * Line state
+ */
+static void r300LineWidth(GLcontext * ctx, GLfloat widthf)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+       widthf = CLAMP(widthf,
+                       ctx->Const.MinPointSize,
+                       ctx->Const.MaxPointSize);
+       R300_STATECHANGE(r300, lcntl);
+       r300->hw.lcntl.cmd[1] =
+           R300_LINE_CNT_HO | R300_LINE_CNT_VE | (int)(widthf * 6.0);
+}
+
+static void r300PolygonMode(GLcontext * ctx, GLenum face, GLenum mode)
+{
+       (void)face;
+       (void)mode;
+
+       r300UpdatePolygonMode(ctx);
+}
+
+/* =============================================================
+ * Stencil
+ */
+
+static int translate_stencil_op(int op)
+{
+       switch (op) {
+       case GL_KEEP:
+               return R300_ZS_KEEP;
+       case GL_ZERO:
+               return R300_ZS_ZERO;
+       case GL_REPLACE:
+               return R300_ZS_REPLACE;
+       case GL_INCR:
+               return R300_ZS_INCR;
+       case GL_DECR:
+               return R300_ZS_DECR;
+       case GL_INCR_WRAP_EXT:
+               return R300_ZS_INCR_WRAP;
+       case GL_DECR_WRAP_EXT:
+               return R300_ZS_DECR_WRAP;
+       case GL_INVERT:
+               return R300_ZS_INVERT;
+       default:
+               WARN_ONCE("Do not know how to translate stencil op");
+               return R300_ZS_KEEP;
+       }
+       return 0;
+}
+
+static void r300ShadeModel(GLcontext * ctx, GLenum mode)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+       R300_STATECHANGE(rmesa, shade);
+       rmesa->hw.shade.cmd[1] = 0x00000002;
+       switch (mode) {
+       case GL_FLAT:
+               rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_FLAT;
+               break;
+       case GL_SMOOTH:
+               rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_SMOOTH;
+               break;
+       default:
+               return;
+       }
+       rmesa->hw.shade.cmd[3] = 0x00000000;
+       rmesa->hw.shade.cmd[4] = 0x00000000;
+}
+
+static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
+                                   GLenum func, GLint ref, GLuint mask)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       GLuint refmask =
+           ((ctx->Stencil.Ref[0] & 0xff) << R300_STENCILREF_SHIFT)
+            | ((ctx->Stencil.ValueMask[0] & 0xff) << R300_STENCILMASK_SHIFT);
+       const unsigned back = ctx->Stencil._BackFace;
+       GLuint flag;
+
+       R300_STATECHANGE(rmesa, zs);
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK;
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &= ~((R300_ZS_MASK <<
+                                              R300_S_FRONT_FUNC_SHIFT)
+                                             | (R300_ZS_MASK <<
+                                                R300_S_BACK_FUNC_SHIFT));
+
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
+           ~((R300_STENCILREF_MASK << R300_STENCILREF_SHIFT) |
+             (R300_STENCILREF_MASK << R300_STENCILMASK_SHIFT));
+
+       flag = translate_func(ctx->Stencil.Function[0]);
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+           (flag << R300_S_FRONT_FUNC_SHIFT);
+
+       flag = translate_func(ctx->Stencil.Function[back]);
+
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+           (flag << R300_S_BACK_FUNC_SHIFT);
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_2] |= refmask;
+}
+
+static void r300StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+       R300_STATECHANGE(rmesa, zs);
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
+           ~(R300_STENCILREF_MASK <<
+             R300_STENCILWRITEMASK_SHIFT);
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_2] |=
+           (ctx->Stencil.
+            WriteMask[0] & R300_STENCILREF_MASK) <<
+            R300_STENCILWRITEMASK_SHIFT;
+}
+
+static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
+                                 GLenum fail, GLenum zfail, GLenum zpass)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       const unsigned back = ctx->Stencil._BackFace;
+
+       R300_STATECHANGE(rmesa, zs);
+       /* It is easier to mask what's left.. */
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &=
+           (R300_ZS_MASK << R300_Z_FUNC_SHIFT) |
+           (R300_ZS_MASK << R300_S_FRONT_FUNC_SHIFT) |
+           (R300_ZS_MASK << R300_S_BACK_FUNC_SHIFT);
+
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+           (translate_stencil_op(ctx->Stencil.FailFunc[0]) <<
+            R300_S_FRONT_SFAIL_OP_SHIFT)
+           | (translate_stencil_op(ctx->Stencil.ZFailFunc[0]) <<
+              R300_S_FRONT_ZFAIL_OP_SHIFT)
+           | (translate_stencil_op(ctx->Stencil.ZPassFunc[0]) <<
+              R300_S_FRONT_ZPASS_OP_SHIFT);
+
+       rmesa->hw.zs.cmd[R300_ZS_CNTL_1] |=
+           (translate_stencil_op(ctx->Stencil.FailFunc[back]) <<
+            R300_S_BACK_SFAIL_OP_SHIFT)
+           | (translate_stencil_op(ctx->Stencil.ZFailFunc[back]) <<
+              R300_S_BACK_ZFAIL_OP_SHIFT)
+           | (translate_stencil_op(ctx->Stencil.ZPassFunc[back]) <<
+              R300_S_BACK_ZPASS_OP_SHIFT);
+}
+
+/* =============================================================
+ * Window position and viewport transformation
+ */
+
+/*
+ * To correctly position primitives:
+ */
+#define SUBPIXEL_X 0.125
+#define SUBPIXEL_Y 0.125
+
+static void r300UpdateWindow(GLcontext * ctx)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+       GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
+       GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
+       const GLfloat *v = ctx->Viewport._WindowMap.m;
+       const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+       const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
+       GLfloat y_scale, y_bias;
+
+       if (render_to_fbo) {
+               y_scale = 1.0;
+               y_bias = 0;
+       } else {
+               y_scale = -1.0;
+               y_bias = yoffset;
+       }
+
+       GLfloat sx = v[MAT_SX];
+       GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
+       GLfloat sy = v[MAT_SY] * y_scale;
+       GLfloat ty = (v[MAT_TY] * y_scale) + y_bias + SUBPIXEL_Y;
+       GLfloat sz = v[MAT_SZ] * depthScale;
+       GLfloat tz = v[MAT_TZ] * depthScale;
+
+       R300_STATECHANGE(rmesa, vpt);
+
+       rmesa->hw.vpt.cmd[R300_VPT_XSCALE] = r300PackFloat32(sx);
+       rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] = r300PackFloat32(tx);
+       rmesa->hw.vpt.cmd[R300_VPT_YSCALE] = r300PackFloat32(sy);
+       rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] = r300PackFloat32(ty);
+       rmesa->hw.vpt.cmd[R300_VPT_ZSCALE] = r300PackFloat32(sz);
+       rmesa->hw.vpt.cmd[R300_VPT_ZOFFSET] = r300PackFloat32(tz);
+}
+
+static void r300Viewport(GLcontext * ctx, GLint x, GLint y,
+                        GLsizei width, GLsizei height)
+{
+       /* Don't pipeline viewport changes, conflict with window offset
+        * setting below.  Could apply deltas to rescue pipelined viewport
+        * values, or keep the originals hanging around.
+        */
+       r300UpdateWindow(ctx);
+
+       radeon_viewport(ctx, x, y, width, height);
+}
+
+static void r300DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
+{
+       r300UpdateWindow(ctx);
+}
+
+void r300UpdateViewportOffset(GLcontext * ctx)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       __DRIdrawablePrivate *dPriv = ((radeonContextPtr) rmesa)->dri.drawable;
+       GLfloat xoffset = (GLfloat) dPriv->x;
+       GLfloat yoffset = (GLfloat) dPriv->y + dPriv->h;
+       const GLfloat *v = ctx->Viewport._WindowMap.m;
+
+       GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
+       GLfloat ty = (-v[MAT_TY]) + yoffset + SUBPIXEL_Y;
+
+       if (rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] != r300PackFloat32(tx) ||
+           rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] != r300PackFloat32(ty)) {
+               /* Note: this should also modify whatever data the context reset
+                * code uses...
+                */
+               R300_STATECHANGE(rmesa, vpt);
+               rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] = r300PackFloat32(tx);
+               rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] = r300PackFloat32(ty);
+
+       }
+
+       radeonUpdateScissor(ctx);
+}
+
+static void
+r300FetchStateParameter(GLcontext * ctx,
+                       const gl_state_index state[STATE_LENGTH],
+                       GLfloat * value)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+       switch (state[0]) {
+       case STATE_INTERNAL:
+               switch (state[1]) {
+               case STATE_R300_WINDOW_DIMENSION:
+                       value[0] = r300->radeon.dri.drawable->w * 0.5f; /* width*0.5 */
+                       value[1] = r300->radeon.dri.drawable->h * 0.5f; /* height*0.5 */
+                       value[2] = 0.5F;        /* for moving range [-1 1] -> [0 1] */
+                       value[3] = 1.0F;        /* not used */
+                       break;
+
+               case STATE_R300_TEXRECT_FACTOR:{
+                               struct gl_texture_object *t =
+                                   ctx->Texture.Unit[state[2]].CurrentTex[TEXTURE_RECT_INDEX];
+
+                               if (t && t->Image[0][t->BaseLevel]) {
+                                       struct gl_texture_image *image =
+                                           t->Image[0][t->BaseLevel];
+                                       value[0] = 1.0 / image->Width2;
+                                       value[1] = 1.0 / image->Height2;
+                               } else {
+                                       value[0] = 1.0;
+                                       value[1] = 1.0;
+                               }
+                               value[2] = 1.0;
+                               value[3] = 1.0;
+                               break;
+                       }
+
+               default:
+                       break;
+               }
+               break;
+
+       default:
+               break;
+       }
+}
+
+/**
+ * Update R300's own internal state parameters.
+ * For now just STATE_R300_WINDOW_DIMENSION
+ */
+void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state)
+{
+       struct r300_fragment_program *fp;
+       struct gl_program_parameter_list *paramList;
+       GLuint i;
+
+       if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM)))
+               return;
+
+       fp = (struct r300_fragment_program *)ctx->FragmentProgram._Current;
+       if (!fp)
+               return;
+
+       paramList = fp->mesa_program.Base.Parameters;
+
+       if (!paramList)
+               return;
+
+       for (i = 0; i < paramList->NumParameters; i++) {
+               if (paramList->Parameters[i].Type == PROGRAM_STATE_VAR) {
+                       r300FetchStateParameter(ctx,
+                                               paramList->Parameters[i].
+                                               StateIndexes,
+                                               paramList->ParameterValues[i]);
+               }
+       }
+}
+
+/* =============================================================
+ * Polygon state
+ */
+static void r300PolygonOffset(GLcontext * ctx, GLfloat factor, GLfloat units)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       GLfloat constant = units;
+
+       switch (ctx->Visual.depthBits) {
+       case 16:
+               constant *= 4.0;
+               break;
+       case 24:
+               constant *= 2.0;
+               break;
+       }
+
+       factor *= 12.0;
+
+/*    fprintf(stderr, "%s f:%f u:%f\n", __FUNCTION__, factor, constant); */
+
+       R300_STATECHANGE(rmesa, zbs);
+       rmesa->hw.zbs.cmd[R300_ZBS_T_FACTOR] = r300PackFloat32(factor);
+       rmesa->hw.zbs.cmd[R300_ZBS_T_CONSTANT] = r300PackFloat32(constant);
+       rmesa->hw.zbs.cmd[R300_ZBS_W_FACTOR] = r300PackFloat32(factor);
+       rmesa->hw.zbs.cmd[R300_ZBS_W_CONSTANT] = r300PackFloat32(constant);
+}
+
+/* Routing and texture-related */
+
+/* r300 doesnt handle GL_CLAMP and GL_MIRROR_CLAMP_EXT correctly when filter is NEAREST.
+ * Since texwrap produces same results for GL_CLAMP and GL_CLAMP_TO_EDGE we use them instead.
+ * We need to recalculate wrap modes whenever filter mode is changed because someone might do:
+ * glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ * glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP);
+ * glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ * Since r300 completely ignores R300_TX_CLAMP when either min or mag is nearest it cant handle
+ * combinations where only one of them is nearest.
+ */
+static unsigned long gen_fixed_filter(unsigned long f)
+{
+       unsigned long mag, min, needs_fixing = 0;
+       //return f;
+
+       /* We ignore MIRROR bit so we dont have to do everything twice */
+       if ((f & ((7 - 1) << R300_TX_WRAP_S_SHIFT)) ==
+           (R300_TX_CLAMP << R300_TX_WRAP_S_SHIFT)) {
+               needs_fixing |= 1;
+       }
+       if ((f & ((7 - 1) << R300_TX_WRAP_T_SHIFT)) ==
+           (R300_TX_CLAMP << R300_TX_WRAP_T_SHIFT)) {
+               needs_fixing |= 2;
+       }
+       if ((f & ((7 - 1) << R300_TX_WRAP_R_SHIFT)) ==
+           (R300_TX_CLAMP << R300_TX_WRAP_R_SHIFT)) {
+               needs_fixing |= 4;
+       }
+
+       if (!needs_fixing)
+               return f;
+
+       mag = f & R300_TX_MAG_FILTER_MASK;
+       min = f & (R300_TX_MIN_FILTER_MASK|R300_TX_MIN_FILTER_MIP_MASK);
+
+       /* TODO: Check for anisto filters too */
+       if ((mag != R300_TX_MAG_FILTER_NEAREST)
+           && (min != R300_TX_MIN_FILTER_NEAREST))
+               return f;
+
+       /* r300 cant handle these modes hence we force nearest to linear */
+       if ((mag == R300_TX_MAG_FILTER_NEAREST)
+           && (min != R300_TX_MIN_FILTER_NEAREST)) {
+               f &= ~R300_TX_MAG_FILTER_NEAREST;
+               f |= R300_TX_MAG_FILTER_LINEAR;
+               return f;
+       }
+
+       if ((min == R300_TX_MIN_FILTER_NEAREST)
+           && (mag != R300_TX_MAG_FILTER_NEAREST)) {
+               f &= ~R300_TX_MIN_FILTER_NEAREST;
+               f |= R300_TX_MIN_FILTER_LINEAR;
+               return f;
+       }
+
+       /* Both are nearest */
+       if (needs_fixing & 1) {
+               f &= ~((7 - 1) << R300_TX_WRAP_S_SHIFT);
+               f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_S_SHIFT;
+       }
+       if (needs_fixing & 2) {
+               f &= ~((7 - 1) << R300_TX_WRAP_T_SHIFT);
+               f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_T_SHIFT;
+       }
+       if (needs_fixing & 4) {
+               f &= ~((7 - 1) << R300_TX_WRAP_R_SHIFT);
+               f |= R300_TX_CLAMP_TO_EDGE << R300_TX_WRAP_R_SHIFT;
+       }
+       return f;
+}
+
+static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       int i;
+       struct r300_fragment_program *fp = (struct r300_fragment_program *)
+           (char *)ctx->FragmentProgram._Current;
+       struct r300_fragment_program_code *code = &fp->code;
+
+       R300_STATECHANGE(r300, fpt);
+
+       for (i = 0; i < code->tex.length; i++) {
+               int unit;
+               int opcode;
+               unsigned long val;
+
+               unit = code->tex.inst[i] >> R300_TEX_ID_SHIFT;
+               unit &= 15;
+
+               val = code->tex.inst[i];
+               val &= ~R300_TEX_ID_MASK;
+
+               opcode =
+                       (val & R300_TEX_INST_MASK) >> R300_TEX_INST_SHIFT;
+               if (opcode == R300_TEX_OP_KIL) {
+                       r300->hw.fpt.cmd[R300_FPT_INSTR_0 + i] = val;
+               } else {
+                       if (tmu_mappings[unit] >= 0) {
+                               val |=
+                                       tmu_mappings[unit] <<
+                                       R300_TEX_ID_SHIFT;
+                               r300->hw.fpt.cmd[R300_FPT_INSTR_0 + i] = val;
+                       } else {
+                               // We get here when the corresponding texture image is incomplete
+                               // (e.g. incomplete mipmaps etc.)
+                               r300->hw.fpt.cmd[R300_FPT_INSTR_0 + i] = val;
+                       }
+               }
+       }
+
+       r300->hw.fpt.cmd[R300_FPT_CMD_0] =
+               cmdpacket0(r300->radeon.radeonScreen,
+                   R300_US_TEX_INST_0, code->tex.length);
+}
+
+static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
+{
+       int i;
+       struct r500_fragment_program *fp = (struct r500_fragment_program *)
+           (char *)ctx->FragmentProgram._Current;
+       struct r500_fragment_program_code *code = &fp->code;
+
+       /* find all the texture instructions and relocate the texture units */
+       for (i = 0; i < code->inst_end + 1; i++) {
+               if ((code->inst[i].inst0 & 0x3) == R500_INST_TYPE_TEX) {
+                       uint32_t val;
+                       int unit, opcode, new_unit;
+
+                       val = code->inst[i].inst1;
+
+                       unit = (val >> 16) & 0xf;
+
+                       val &= ~(0xf << 16);
+
+                       opcode = val & (0x7 << 22);
+                       if (opcode == R500_TEX_INST_TEXKILL) {
+                               new_unit = 0;
+                       } else {
+                               if (tmu_mappings[unit] >= 0) {
+                                       new_unit = tmu_mappings[unit];
+                               } else {
+                                       new_unit = 0;
+                               }
+                       }
+                       val |= R500_TEX_ID(new_unit);
+                       code->inst[i].inst1 = val;
+               }
+       }
+}
+
+static GLuint translate_lod_bias(GLfloat bias)
+{
+       GLint b = (int)(bias*32);
+       if (b >= (1 << 9))
+               b = (1 << 9)-1;
+       else if (b < -(1 << 9))
+               b = -(1 << 9);
+       return (((GLuint)b) << R300_LOD_BIAS_SHIFT) & R300_LOD_BIAS_MASK;
+}
+
+static void r300SetupTextures(GLcontext * ctx)
+{
+       int i, mtu;
+       struct radeon_tex_obj *t;
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+       int hw_tmu = 0;
+       int last_hw_tmu = -1;   /* -1 translates into no setup costs for fields */
+       int tmu_mappings[R300_MAX_TEXTURE_UNITS] = { -1, };
+       struct r300_fragment_program *fp = (struct r300_fragment_program *)
+           (char *)ctx->FragmentProgram._Current;
+
+       R300_STATECHANGE(r300, txe);
+       R300_STATECHANGE(r300, tex.filter);
+       R300_STATECHANGE(r300, tex.filter_1);
+       R300_STATECHANGE(r300, tex.size);
+       R300_STATECHANGE(r300, tex.format);
+       R300_STATECHANGE(r300, tex.pitch);
+       R300_STATECHANGE(r300, tex.offset);
+       R300_STATECHANGE(r300, tex.chroma_key);
+       R300_STATECHANGE(r300, tex.border_color);
+
+       r300->hw.txe.cmd[R300_TXE_ENABLE] = 0x0;
+
+       mtu = r300->radeon.glCtx->Const.MaxTextureUnits;
+       if (RADEON_DEBUG & DEBUG_STATE)
+               fprintf(stderr, "mtu=%d\n", mtu);
+
+       if (mtu > R300_MAX_TEXTURE_UNITS) {
+               fprintf(stderr,
+                       "Aiiee ! mtu=%d is greater than R300_MAX_TEXTURE_UNITS=%d\n",
+                       mtu, R300_MAX_TEXTURE_UNITS);
+               _mesa_exit(-1);
+       }
+
+       /* We cannot let disabled tmu offsets pass DRM */
+       for (i = 0; i < mtu; i++) {
+               if (ctx->Texture.Unit[i]._ReallyEnabled) {
+                       tmu_mappings[i] = hw_tmu;
+
+                       t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
+                       if (!t)
+                               continue;
+
+                       if ((t->pp_txformat & 0xffffff00) == 0xffffff00) {
+                               WARN_ONCE
+                                   ("unknown texture format (entry %x) encountered. Help me !\n",
+                                    t->pp_txformat & 0xff);
+                       }
+
+                       if (RADEON_DEBUG & DEBUG_STATE)
+                               fprintf(stderr,
+                                       "Activating texture unit %d\n", i);
+
+                       r300->hw.txe.cmd[R300_TXE_ENABLE] |= (1 << hw_tmu);
+
+                       r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
+                                               hw_tmu] =
+                           gen_fixed_filter(t->pp_txfilter) | (hw_tmu << 28);
+                       /* Note: There is a LOD bias per texture unit and a LOD bias
+                        * per texture object. We add them here to get the correct behaviour.
+                        * (The per-texture object LOD bias was introduced in OpenGL 1.4
+                        * and is not present in the EXT_texture_object extension).
+                        */
+                       r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+                               t->pp_txfilter_1 |
+                               translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.LodBias);
+                       r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+                           t->pp_txsize;
+                       r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
+                                               hw_tmu] = t->pp_txformat;
+                       r300->hw.tex.pitch.cmd[R300_TEX_VALUE_0 + hw_tmu] =
+                         t->pp_txpitch;
+                       r300->hw.textures[hw_tmu] = t;
+
+                       if (t->tile_bits & R300_TXO_MACRO_TILE) {
+                               WARN_ONCE("macro tiling enabled!\n");
+                       }
+
+                       if (t->tile_bits & R300_TXO_MICRO_TILE) {
+                               WARN_ONCE("micro tiling enabled!\n");
+                       }
+
+                       r300->hw.tex.chroma_key.cmd[R300_TEX_VALUE_0 +
+                                                   hw_tmu] = 0x0;
+                       r300->hw.tex.border_color.cmd[R300_TEX_VALUE_0 +
+                                                     hw_tmu] =
+                           t->pp_border_color;
+
+                       last_hw_tmu = hw_tmu;
+
+                       hw_tmu++;
+               }
+       }
+
+       r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, last_hw_tmu + 1);
+       r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER1_0, last_hw_tmu + 1);
+       r300->hw.tex.size.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_SIZE_0, last_hw_tmu + 1);
+       r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT_0, last_hw_tmu + 1);
+       r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT2_0, last_hw_tmu + 1);
+       r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_OFFSET_0, last_hw_tmu + 1);
+       r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_CHROMA_KEY_0, last_hw_tmu + 1);
+       r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
+           cmdpacket0(r300->radeon.radeonScreen, R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
+
+       if (!fp)                /* should only happenen once, just after context is created */
+               return;
+
+       if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
+               if (fp->mesa_program.UsesKill && last_hw_tmu < 0) {
+                       // The KILL operation requires the first texture unit
+                       // to be enabled.
+                       r300->hw.txe.cmd[R300_TXE_ENABLE] |= 1;
+                       r300->hw.tex.filter.cmd[R300_TEX_VALUE_0] = 0;
+                       r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
+                               cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, 1);
+               }
+               r300SetupFragmentShaderTextures(ctx, tmu_mappings);
+       } else
+               r500SetupFragmentShaderTextures(ctx, tmu_mappings);
+
+       if (RADEON_DEBUG & DEBUG_STATE)
+               fprintf(stderr, "TX_ENABLE: %08x  last_hw_tmu=%d\n",
+                       r300->hw.txe.cmd[R300_TXE_ENABLE], last_hw_tmu);
+}
+
+union r300_outputs_written {
+       GLuint vp_outputs;      /* hw_tcl_on */
+        DECLARE_RENDERINPUTS(index_bitset);    /* !hw_tcl_on */
+};
+
+#define R300_OUTPUTS_WRITTEN_TEST(ow, vp_result, tnl_attrib) \
+       ((hw_tcl_on) ? (ow).vp_outputs & (1 << (vp_result)) : \
+       RENDERINPUTS_TEST( (ow.index_bitset), (tnl_attrib) ))
+
+static void r300SetupRSUnit(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+        TNLcontext *tnl = TNL_CONTEXT(ctx);
+       struct vertex_buffer *VB = &tnl->vb;
+       union r300_outputs_written OutputsWritten;
+       GLuint InputsRead;
+       int fp_reg, high_rr;
+       int col_ip, tex_ip;
+       int rs_tex_count = 0;
+       int i, count, col_fmt;
+
+       if (hw_tcl_on)
+               OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+       else
+               RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->state.render_inputs_bitset);
+
+       if (ctx->FragmentProgram._Current)
+               InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
+       else {
+               fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
+               return;         /* This should only ever happen once.. */
+       }
+
+       R300_STATECHANGE(r300, ri);
+       R300_STATECHANGE(r300, rc);
+       R300_STATECHANGE(r300, rr);
+
+       fp_reg = col_ip = tex_ip = col_fmt = 0;
+
+       r300->hw.rc.cmd[1] = 0;
+       r300->hw.rc.cmd[2] = 0;
+       for (i=0; i<R300_RR_CMDSIZE-1; ++i)
+               r300->hw.rr.cmd[R300_RR_INST_0 + i] = 0;
+
+       for (i=0; i<R300_RI_CMDSIZE-1; ++i)
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = 0;
+
+
+       if (InputsRead & FRAG_BIT_COL0) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL0;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
+               }
+       }
+
+       if (InputsRead & FRAG_BIT_COL1) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL1;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
+               }
+       }
+
+       for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+               if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
+                   continue;
+
+               if (!R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
+                   WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
+                   continue;
+               }
+
+               int swiz;
+
+               /* with TCL we always seem to route 4 components */
+               if (hw_tcl_on)
+                 count = 4;
+               else
+                 count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
+
+               switch(count) {
+               case 4: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3); break;
+               case 3: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
+               default:
+               case 1:
+               case 2: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(R300_RS_SEL_K0) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
+               };
+
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz | R300_RS_TEX_PTR(rs_tex_count);
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~(FRAG_BIT_TEX0 << i);
+               rs_tex_count += count;
+               ++tex_ip;
+               ++fp_reg;
+       }
+
+       if (InputsRead & FRAG_BIT_FOGC) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_FOGC, _TNL_ATTRIB_FOG)) {
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |=  R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) |  R300_RS_TEX_PTR(rs_tex_count);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_FOGC;
+                       rs_tex_count += 4;
+                       ++tex_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants fogc, vp doesn't provide it\n");
+               }
+       }
+
+       if (InputsRead & FRAG_BIT_WPOS) {
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |=  R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) |  R300_RS_TEX_PTR(rs_tex_count);
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~FRAG_BIT_WPOS;
+               rs_tex_count += 4;
+               ++tex_ip;
+               ++fp_reg;
+       }
+       InputsRead &= ~FRAG_BIT_WPOS;
+
+       /* Setup default color if no color or tex was set */
+       if (rs_tex_count == 0 && col_ip == 0) {
+               r300->hw.rr.cmd[R300_RR_INST_0] = R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(0) | R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
+               ++col_ip;
+       }
+
+       high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
+       r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT)  | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+       r300->hw.rc.cmd[2] |= high_rr - 1;
+
+        r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, high_rr);
+
+       if (InputsRead)
+               WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
+}
+
+static void r500SetupRSUnit(GLcontext * ctx)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+        TNLcontext *tnl = TNL_CONTEXT(ctx);
+       struct vertex_buffer *VB = &tnl->vb;
+       union r300_outputs_written OutputsWritten;
+       GLuint InputsRead;
+       int fp_reg, high_rr;
+       int col_ip, tex_ip;
+       int rs_tex_count = 0;
+       int i, count, col_fmt;
+
+       if (hw_tcl_on)
+               OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+       else
+               RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->state.render_inputs_bitset);
+
+       if (ctx->FragmentProgram._Current)
+               InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
+       else {
+               fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
+               return;         /* This should only ever happen once.. */
+       }
+
+       R300_STATECHANGE(r300, ri);
+       R300_STATECHANGE(r300, rc);
+       R300_STATECHANGE(r300, rr);
+
+       fp_reg = col_ip = tex_ip = col_fmt = 0;
+
+       r300->hw.rc.cmd[1] = 0;
+       r300->hw.rc.cmd[2] = 0;
+       for (i=0; i<R300_RR_CMDSIZE-1; ++i)
+               r300->hw.rr.cmd[R300_RR_INST_0 + i] = 0;
+
+       for (i=0; i<R500_RI_CMDSIZE-1; ++i)
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + i] = 0;
+
+
+       if (InputsRead & FRAG_BIT_COL0) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL0;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col0, vp doesn't provide it\n");
+               }
+       }
+
+       if (InputsRead & FRAG_BIT_COL1) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
+                       count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
+                       if (count == 4)
+                           col_fmt = R300_RS_COL_FMT_RGBA;
+                       else if (count == 3)
+                           col_fmt = R300_RS_COL_FMT_RGB1;
+                       else
+                           col_fmt = R300_RS_COL_FMT_0001;
+
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+                       r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_COL1;
+                       ++col_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants col1, vp doesn't provide it\n");
+               }
+       }
+
+
+       for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+               if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
+                   continue;
+
+               if (!R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_TEX0 + i, _TNL_ATTRIB_TEX(i))) {
+                   WARN_ONCE("fragprog wants coords for tex%d, vp doesn't provide them!\n", i);
+                   continue;
+               }
+
+               int swiz = 0;
+
+               /* with TCL we always seem to route 4 components */
+               if (hw_tcl_on)
+                 count = 4;
+               else
+                 count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
+
+               if (count == 4) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= (rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT;
+               } else if (count == 3) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
+               } else if (count == 2) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
+               } else if (count == 1) {
+                       swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
+               } else {
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
+                       swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
+               }
+
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz;
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~(FRAG_BIT_TEX0 << i);
+               rs_tex_count += count;
+               ++tex_ip;
+               ++fp_reg;
+       }
+
+       if (InputsRead & FRAG_BIT_FOGC) {
+               if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_FOGC, _TNL_ATTRIB_FOG)) {
+                       r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
+                               ((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+                               ((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+                               ((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
+
+                       r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+                       InputsRead &= ~FRAG_BIT_FOGC;
+                       rs_tex_count += 4;
+                       ++tex_ip;
+                       ++fp_reg;
+               } else {
+                       WARN_ONCE("fragprog wants fogc, vp doesn't provide it\n");
+               }
+       }
+
+       if (InputsRead & FRAG_BIT_WPOS) {
+               r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
+                               ((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+                               ((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+                               ((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
+
+               r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
+               InputsRead &= ~FRAG_BIT_WPOS;
+               rs_tex_count += 4;
+               ++tex_ip;
+               ++fp_reg;
+       }
+
+       /* Setup default color if no color or tex was set */
+       if (rs_tex_count == 0 && col_ip == 0) {
+               r300->hw.rr.cmd[R300_RR_INST_0] |= R500_RS_INST_COL_ID(0) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(0) | R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
+               ++col_ip;
+       }
+
+       high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
+       r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT)  | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+       r300->hw.rc.cmd[2] |= 0xC0 | (high_rr - 1);
+
+       r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, high_rr);
+
+       if (InputsRead)
+               WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
+}
+
+
+
+
+#define bump_vpu_count(ptr, new_count)   do{\
+       drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
+       int _nc=(new_count)/4; \
+       assert(_nc < 256); \
+       if(_nc>_p->vpu.count)_p->vpu.count=_nc;\
+       }while(0)
+
+static INLINE void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest, struct r300_vertex_shader_fragment *vsf)
+{
+       int i;
+
+       if (vsf->length == 0)
+               return;
+
+       if (vsf->length & 0x3) {
+               fprintf(stderr, "VERTEX_SHADER_FRAGMENT must have length divisible by 4\n");
+               _mesa_exit(-1);
+       }
+
+       switch ((dest >> 8) & 0xf) {
+       case 0:
+               R300_STATECHANGE(r300, vpi);
+               for (i = 0; i < vsf->length; i++)
+                       r300->hw.vpi.cmd[R300_VPI_INSTR_0 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
+               bump_vpu_count(r300->hw.vpi.cmd, vsf->length + 4 * (dest & 0xff));
+               break;
+
+       case 2:
+               R300_STATECHANGE(r300, vpp);
+               for (i = 0; i < vsf->length; i++)
+                       r300->hw.vpp.cmd[R300_VPP_PARAM_0 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
+               bump_vpu_count(r300->hw.vpp.cmd, vsf->length + 4 * (dest & 0xff));
+               break;
+       case 4:
+               R300_STATECHANGE(r300, vps);
+               for (i = 0; i < vsf->length; i++)
+                       r300->hw.vps.cmd[1 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
+               bump_vpu_count(r300->hw.vps.cmd, vsf->length + 4 * (dest & 0xff));
+               break;
+       default:
+               fprintf(stderr, "%s:%s don't know how to handle dest %04x\n", __FILE__, __FUNCTION__, dest);
+               _mesa_exit(-1);
+       }
+}
+
+#define MIN3(a, b, c)  ((a) < (b) ? MIN2(a, c) : MIN2(b, c))
+
+
+static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
+                       GLuint output_count, GLuint temp_count)
+{
+    int vtx_mem_size;
+    int pvs_num_slots;
+    int pvs_num_cntrls;
+
+    /* Flush PVS engine before changing PVS_NUM_SLOTS, PVS_NUM_CNTRLS.
+     * See r500 docs 6.5.2 - done in emit */
+
+    /* avoid division by zero */
+    if (input_count == 0) input_count = 1;
+    if (output_count == 0) output_count = 1;
+    if (temp_count == 0) temp_count = 1;
+
+    if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+       vtx_mem_size = 128;
+    else
+       vtx_mem_size = 72;
+
+    pvs_num_slots = MIN3(10, vtx_mem_size/input_count, vtx_mem_size/output_count);
+    pvs_num_cntrls = MIN2(6, vtx_mem_size/temp_count);
+
+    R300_STATECHANGE(rmesa, vap_cntl);
+    if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] =
+           (pvs_num_slots << R300_PVS_NUM_SLOTS_SHIFT) |
+           (pvs_num_cntrls << R300_PVS_NUM_CNTLRS_SHIFT) |
+           (12 << R300_VF_MAX_VTX_NUM_SHIFT);
+       if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+           rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= R500_TCL_STATE_OPTIMIZATION;
+    } else
+       /* not sure about non-tcl */
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+                                   (5 << R300_PVS_NUM_CNTLRS_SHIFT) |
+                                   (5 << R300_VF_MAX_VTX_NUM_SHIFT));
+
+    if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515)
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (2 << R300_PVS_NUM_FPUS_SHIFT);
+    else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) ||
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) ||
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (5 << R300_PVS_NUM_FPUS_SHIFT);
+    else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) ||
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420))
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (6 << R300_PVS_NUM_FPUS_SHIFT);
+    else if ((rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) ||
+            (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580))
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (8 << R300_PVS_NUM_FPUS_SHIFT);
+    else
+       rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] |= (4 << R300_PVS_NUM_FPUS_SHIFT);
+
+}
+
+static void r300SetupDefaultVertexProgram(r300ContextPtr rmesa)
+{
+       struct r300_vertex_shader_state *prog = &(rmesa->state.vertex_shader);
+       GLuint o_reg = 0;
+       GLuint i_reg = 0;
+       int i;
+       int inst_count = 0;
+       int param_count = 0;
+       int program_end = 0;
+
+       for (i = VERT_ATTRIB_POS; i < VERT_ATTRIB_MAX; i++) {
+               if (rmesa->state.sw_tcl_inputs[i] != -1) {
+                       prog->program.body.i[program_end + 0] = PVS_OP_DST_OPERAND(VE_MULTIPLY, GL_FALSE, GL_FALSE, o_reg++, VSF_FLAG_ALL, PVS_DST_REG_OUT);
+                       prog->program.body.i[program_end + 1] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+                       prog->program.body.i[program_end + 2] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+                       prog->program.body.i[program_end + 3] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+                       program_end += 4;
+                       i_reg++;
+               }
+       }
+
+       prog->program.length = program_end;
+
+       r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START,
+                                      &(prog->program));
+       inst_count = (prog->program.length / 4) - 1;
+
+       r300VapCntl(rmesa, i_reg, o_reg, 0);
+
+       R300_STATECHANGE(rmesa, pvs);
+       rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
+           (0 << R300_PVS_FIRST_INST_SHIFT) |
+           (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
+           (inst_count << R300_PVS_LAST_INST_SHIFT);
+       rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
+           (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
+           (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
+       rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
+           (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+}
+
+static int bit_count (int x)
+{
+    x = ((x & 0xaaaaaaaaU) >> 1) + (x & 0x55555555U);
+    x = ((x & 0xccccccccU) >> 2) + (x & 0x33333333U);
+    x = (x >> 16) + (x & 0xffff);
+    x = ((x & 0xf0f0) >> 4) + (x & 0x0f0f);
+    return (x >> 8) + (x & 0x00ff);
+}
+
+static void r300SetupRealVertexProgram(r300ContextPtr rmesa)
+{
+       GLcontext *ctx = rmesa->radeon.glCtx;
+       struct r300_vertex_program *prog = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
+       int inst_count = 0;
+       int param_count = 0;
+
+       /* FIXME: r300SetupVertexProgramFragment */
+       R300_STATECHANGE(rmesa, vpp);
+       param_count =
+           r300VertexProgUpdateParams(ctx,
+                                      (struct r300_vertex_program_cont *)
+                                      ctx->VertexProgram._Current,
+                                      (float *)&rmesa->hw.vpp.
+                                      cmd[R300_VPP_PARAM_0]);
+       bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
+       param_count /= 4;
+
+       r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START, &(prog->program));
+       inst_count = (prog->program.length / 4) - 1;
+
+       r300VapCntl(rmesa, bit_count(prog->key.InputsRead),
+                   bit_count(prog->key.OutputsWritten), prog->num_temporaries);
+
+       R300_STATECHANGE(rmesa, pvs);
+       rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
+         (0 << R300_PVS_FIRST_INST_SHIFT) |
+         (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
+         (inst_count << R300_PVS_LAST_INST_SHIFT);
+       rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
+         (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
+         (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
+       rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
+         (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+}
+
+
+static void r300SetupVertexProgram(r300ContextPtr rmesa)
+{
+       GLcontext *ctx = rmesa->radeon.glCtx;
+
+       /* Reset state, in case we don't use something */
+       ((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
+       ((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
+       ((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
+
+       /* Not sure why this doesnt work...
+          0x400 area might have something to do with pixel shaders as it appears right after pfs programming.
+          0x406 is set to { 0.0, 0.0, 1.0, 0.0 } most of the time but should change with smooth points and in other rare cases. */
+       //setup_vertex_shader_fragment(rmesa, 0x406, &unk4);
+       if (hw_tcl_on && ((struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx))->translated) {
+               r300SetupRealVertexProgram(rmesa);
+       } else {
+               /* FIXME: This needs to be replaced by vertex shader generation code. */
+               r300SetupDefaultVertexProgram(rmesa);
+       }
+
+}
+
+/**
+ * Enable/Disable states.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       if (RADEON_DEBUG & DEBUG_STATE)
+               fprintf(stderr, "%s( %s = %s )\n", __FUNCTION__,
+                       _mesa_lookup_enum_by_nr(cap),
+                       state ? "GL_TRUE" : "GL_FALSE");
+
+       switch (cap) {
+       case GL_TEXTURE_1D:
+       case GL_TEXTURE_2D:
+       case GL_TEXTURE_3D:
+               /* empty */
+               break;
+       case GL_FOG:
+               /* empty */
+               break;
+       case GL_ALPHA_TEST:
+               r300SetAlphaState(ctx);
+               break;
+       case GL_COLOR_LOGIC_OP:
+               r300SetLogicOpState(ctx);
+               /* fall-through, because logic op overrides blending */
+       case GL_BLEND:
+               r300SetBlendState(ctx);
+               break;
+       case GL_CLIP_PLANE0:
+       case GL_CLIP_PLANE1:
+       case GL_CLIP_PLANE2:
+       case GL_CLIP_PLANE3:
+       case GL_CLIP_PLANE4:
+       case GL_CLIP_PLANE5:
+               r300SetClipPlaneState(ctx, cap, state);
+               break;
+       case GL_DEPTH_TEST:
+               r300SetDepthState(ctx);
+               break;
+       case GL_STENCIL_TEST:
+               r300SetStencilState(ctx, state);
+               break;
+       case GL_CULL_FACE:
+               r300UpdateCulling(ctx);
+               break;
+       case GL_POLYGON_OFFSET_POINT:
+       case GL_POLYGON_OFFSET_LINE:
+       case GL_POLYGON_OFFSET_FILL:
+               r300SetPolygonOffsetState(ctx, state);
+               break;
+       case GL_SCISSOR_TEST:
+               radeon_firevertices(&rmesa->radeon);
+               rmesa->radeon.state.scissor.enabled = state;
+               radeonUpdateScissor( ctx );
+               break;
+       default:
+               break;
+       }
+}
+
+/**
+ * Completely recalculates hardware state based on the Mesa state.
+ */
+static void r300ResetHwState(r300ContextPtr r300)
+{
+       GLcontext *ctx = r300->radeon.glCtx;
+       int has_tcl = 1;
+
+       if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+               has_tcl = 0;
+
+       if (RADEON_DEBUG & DEBUG_STATE)
+               fprintf(stderr, "%s\n", __FUNCTION__);
+
+       radeon_firevertices(&r300->radeon);
+
+       r300ColorMask(ctx,
+                     ctx->Color.ColorMask[RCOMP],
+                     ctx->Color.ColorMask[GCOMP],
+                     ctx->Color.ColorMask[BCOMP], ctx->Color.ColorMask[ACOMP]);
+
+       r300Enable(ctx, GL_DEPTH_TEST, ctx->Depth.Test);
+       r300DepthMask(ctx, ctx->Depth.Mask);
+       r300DepthFunc(ctx, ctx->Depth.Func);
+
+       /* stencil */
+       r300Enable(ctx, GL_STENCIL_TEST, ctx->Stencil._Enabled);
+       r300StencilMaskSeparate(ctx, 0, ctx->Stencil.WriteMask[0]);
+       r300StencilFuncSeparate(ctx, 0, ctx->Stencil.Function[0],
+                               ctx->Stencil.Ref[0], ctx->Stencil.ValueMask[0]);
+       r300StencilOpSeparate(ctx, 0, ctx->Stencil.FailFunc[0],
+                             ctx->Stencil.ZFailFunc[0],
+                             ctx->Stencil.ZPassFunc[0]);
+
+       r300UpdateCulling(ctx);
+
+       r300SetBlendState(ctx);
+       r300SetLogicOpState(ctx);
+
+       r300AlphaFunc(ctx, ctx->Color.AlphaFunc, ctx->Color.AlphaRef);
+       r300Enable(ctx, GL_ALPHA_TEST, ctx->Color.AlphaEnabled);
+
+       r300->hw.vte.cmd[1] = R300_VPORT_X_SCALE_ENA
+           | R300_VPORT_X_OFFSET_ENA
+           | R300_VPORT_Y_SCALE_ENA
+           | R300_VPORT_Y_OFFSET_ENA
+           | R300_VPORT_Z_SCALE_ENA
+           | R300_VPORT_Z_OFFSET_ENA | R300_VTX_W0_FMT;
+       r300->hw.vte.cmd[2] = 0x00000008;
+
+       r300->hw.vap_vf_max_vtx_indx.cmd[1] = 0x00FFFFFF;
+       r300->hw.vap_vf_max_vtx_indx.cmd[2] = 0x00000000;
+
+#ifdef MESA_LITTLE_ENDIAN
+       r300->hw.vap_cntl_status.cmd[1] = R300_VC_NO_SWAP;
+#else
+       r300->hw.vap_cntl_status.cmd[1] = R300_VC_32BIT_SWAP;
+#endif
+
+       /* disable VAP/TCL on non-TCL capable chips */
+       if (!has_tcl)
+               r300->hw.vap_cntl_status.cmd[1] |= R300_VAP_TCL_BYPASS;
+
+       r300->hw.vap_psc_sgn_norm_cntl.cmd[1] = 0xAAAAAAAA;
+
+       /* XXX: Other families? */
+       if (has_tcl) {
+               r300->hw.vap_clip_cntl.cmd[1] = R300_PS_UCP_MODE_DIST_COP;
+
+               r300->hw.vap_clip.cmd[1] = r300PackFloat32(1.0); /* X */
+               r300->hw.vap_clip.cmd[2] = r300PackFloat32(1.0); /* X */
+               r300->hw.vap_clip.cmd[3] = r300PackFloat32(1.0); /* Y */
+               r300->hw.vap_clip.cmd[4] = r300PackFloat32(1.0); /* Y */
+
+               switch (r300->radeon.radeonScreen->chip_family) {
+               case CHIP_FAMILY_R300:
+                       r300->hw.vap_pvs_vtx_timeout_reg.cmd[1] = R300_2288_R300;
+                       break;
+               default:
+                       r300->hw.vap_pvs_vtx_timeout_reg.cmd[1] = R300_2288_RV350;
+                       break;
+               }
+       }
+
+       r300->hw.gb_enable.cmd[1] = R300_GB_POINT_STUFF_ENABLE
+           | R300_GB_LINE_STUFF_ENABLE
+           | R300_GB_TRIANGLE_STUFF_ENABLE;
+
+       r300->hw.gb_misc.cmd[R300_GB_MISC_MSPOS_0] = 0x66666666;
+       r300->hw.gb_misc.cmd[R300_GB_MISC_MSPOS_1] = 0x06666666;
+
+       r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] =
+           R300_GB_TILE_ENABLE | R300_GB_TILE_SIZE_16 /*| R300_GB_SUBPIXEL_1_16*/;
+       switch (r300->radeon.radeonScreen->num_gb_pipes) {
+       case 1:
+       default:
+               r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] |=
+                   R300_GB_TILE_PIPE_COUNT_RV300;
+               break;
+       case 2:
+               r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] |=
+                   R300_GB_TILE_PIPE_COUNT_R300;
+               break;
+       case 3:
+               r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] |=
+                   R300_GB_TILE_PIPE_COUNT_R420_3P;
+               break;
+       case 4:
+               r300->hw.gb_misc.cmd[R300_GB_MISC_TILE_CONFIG] |=
+                   R300_GB_TILE_PIPE_COUNT_R420;
+               break;
+       }
+
+       /* XXX: Enable anti-aliasing? */
+       r300->hw.gb_misc.cmd[R300_GB_MISC_AA_CONFIG] = GB_AA_CONFIG_AA_DISABLE;
+       r300->hw.gb_misc.cmd[R300_GB_MISC_SELECT] = 0;
+
+       r300->hw.ga_point_s0.cmd[1] = r300PackFloat32(0.0);
+       r300->hw.ga_point_s0.cmd[2] = r300PackFloat32(0.0);
+       r300->hw.ga_point_s0.cmd[3] = r300PackFloat32(1.0);
+       r300->hw.ga_point_s0.cmd[4] = r300PackFloat32(1.0);
+
+       r300->hw.ga_triangle_stipple.cmd[1] = 0x00050005;
+
+       r300PointSize(ctx, 1.0);
+
+       r300->hw.ga_point_minmax.cmd[1] = 0x18000006;
+       r300->hw.ga_point_minmax.cmd[2] = 0x00020006;
+       r300->hw.ga_point_minmax.cmd[3] = r300PackFloat32(1.0 / 192.0);
+
+       r300LineWidth(ctx, 1.0);
+
+       r300->hw.ga_line_stipple.cmd[1] = 0;
+       r300->hw.ga_line_stipple.cmd[2] = r300PackFloat32(0.0);
+       r300->hw.ga_line_stipple.cmd[3] = r300PackFloat32(1.0);
+
+       r300ShadeModel(ctx, ctx->Light.ShadeModel);
+
+       r300PolygonMode(ctx, GL_FRONT, ctx->Polygon.FrontMode);
+       r300PolygonMode(ctx, GL_BACK, ctx->Polygon.BackMode);
+       r300->hw.zbias_cntl.cmd[1] = 0x00000000;
+
+       r300PolygonOffset(ctx, ctx->Polygon.OffsetFactor,
+                         ctx->Polygon.OffsetUnits);
+       r300Enable(ctx, GL_POLYGON_OFFSET_POINT, ctx->Polygon.OffsetPoint);
+       r300Enable(ctx, GL_POLYGON_OFFSET_LINE, ctx->Polygon.OffsetLine);
+       r300Enable(ctx, GL_POLYGON_OFFSET_FILL, ctx->Polygon.OffsetFill);
+
+       r300->hw.su_depth_scale.cmd[1] = 0x4B7FFFFF;
+       r300->hw.su_depth_scale.cmd[2] = 0x00000000;
+
+       r300->hw.sc_hyperz.cmd[1] = 0x0000001C;
+       r300->hw.sc_hyperz.cmd[2] = 0x2DA49525;
+
+       r300->hw.sc_screendoor.cmd[1] = 0x00FFFFFF;
+
+       r300->hw.us_out_fmt.cmd[1] = R500_OUT_FMT_C4_8  |
+         R500_C0_SEL_B | R500_C1_SEL_G | R500_C2_SEL_R | R500_C3_SEL_A;
+       r300->hw.us_out_fmt.cmd[2] = R500_OUT_FMT_UNUSED |
+         R500_C0_SEL_B | R500_C1_SEL_G | R500_C2_SEL_R | R500_C3_SEL_A;
+       r300->hw.us_out_fmt.cmd[3] = R500_OUT_FMT_UNUSED |
+         R500_C0_SEL_B | R500_C1_SEL_G | R500_C2_SEL_R | R500_C3_SEL_A;
+       r300->hw.us_out_fmt.cmd[4] = R500_OUT_FMT_UNUSED |
+         R500_C0_SEL_B | R500_C1_SEL_G | R500_C2_SEL_R | R500_C3_SEL_A;
+       r300->hw.us_out_fmt.cmd[5] = R300_W_FMT_W0 | R300_W_SRC_US;
+
+       /* disable fog unit */
+       r300->hw.fogs.cmd[R300_FOGS_STATE] = 0;
+       r300->hw.fg_depth_src.cmd[1] = R300_FG_DEPTH_SRC_SCAN;
+
+       r300->hw.rb3d_cctl.cmd[1] = 0;
+
+       r300BlendColor(ctx, ctx->Color.BlendColor);
+
+       r300->hw.rb3d_dither_ctl.cmd[1] = 0;
+       r300->hw.rb3d_dither_ctl.cmd[2] = 0;
+       r300->hw.rb3d_dither_ctl.cmd[3] = 0;
+       r300->hw.rb3d_dither_ctl.cmd[4] = 0;
+       r300->hw.rb3d_dither_ctl.cmd[5] = 0;
+       r300->hw.rb3d_dither_ctl.cmd[6] = 0;
+       r300->hw.rb3d_dither_ctl.cmd[7] = 0;
+       r300->hw.rb3d_dither_ctl.cmd[8] = 0;
+       r300->hw.rb3d_dither_ctl.cmd[9] = 0;
+
+       r300->hw.rb3d_aaresolve_ctl.cmd[1] = 0;
+
+       r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[1] = 0x00000000;
+       r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[2] = 0xffffffff;
+
+       r300->hw.zb_depthclearvalue.cmd[1] = 0;
+
+       r300->hw.zstencil_format.cmd[2] = R300_ZTOP_DISABLE;
+       r300->hw.zstencil_format.cmd[3] = 0x00000003;
+       r300->hw.zstencil_format.cmd[4] = 0x00000000;
+       r300SetEarlyZState(ctx);
+
+       r300->hw.unk4F30.cmd[1] = 0;
+       r300->hw.unk4F30.cmd[2] = 0;
+
+       r300->hw.zb_hiz_offset.cmd[1] = 0;
+
+       r300->hw.zb_hiz_pitch.cmd[1] = 0;
+
+       r300VapCntl(r300, 0, 0, 0);
+       if (has_tcl) {
+               r300->hw.vps.cmd[R300_VPS_ZERO_0] = 0;
+               r300->hw.vps.cmd[R300_VPS_ZERO_1] = 0;
+               r300->hw.vps.cmd[R300_VPS_POINTSIZE] = r300PackFloat32(1.0);
+               r300->hw.vps.cmd[R300_VPS_ZERO_3] = 0;
+       }
+
+       r300->radeon.hw.all_dirty = GL_TRUE;
+}
+
+void r300UpdateShaders(r300ContextPtr rmesa)
+{
+       GLcontext *ctx;
+       struct r300_vertex_program *vp;
+       int i;
+
+       ctx = rmesa->radeon.glCtx;
+
+       if (rmesa->radeon.NewGLState && hw_tcl_on) {
+               rmesa->radeon.NewGLState = 0;
+
+               for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
+                       rmesa->temp_attrib[i] =
+                           TNL_CONTEXT(ctx)->vb.AttribPtr[i];
+                       TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
+                           &rmesa->dummy_attrib[i];
+               }
+
+               _tnl_UpdateFixedFunctionProgram(ctx);
+
+               for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
+                       TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
+                           rmesa->temp_attrib[i];
+               }
+
+               r300SelectVertexShader(rmesa);
+               vp = (struct r300_vertex_program *)
+                   CURRENT_VERTEX_SHADER(ctx);
+               /*if (vp->translated == GL_FALSE)
+                  r300TranslateVertexShader(vp); */
+               if (vp->translated == GL_FALSE) {
+                       fprintf(stderr, "Failing back to sw-tcl\n");
+                       hw_tcl_on = future_hw_tcl_on = 0;
+                       r300ResetHwState(rmesa);
+
+                       r300UpdateStateParameters(ctx, _NEW_PROGRAM);
+                       return;
+               }
+       }
+       r300UpdateStateParameters(ctx, _NEW_PROGRAM);
+}
+
+static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
+       struct gl_program *program, struct prog_src_register srcreg)
+{
+       static const GLfloat dummy[4] = { 0, 0, 0, 0 };
+
+       switch(srcreg.File) {
+       case PROGRAM_LOCAL_PARAM:
+               return program->LocalParams[srcreg.Index];
+       case PROGRAM_ENV_PARAM:
+               return ctx->FragmentProgram.Parameters[srcreg.Index];
+       case PROGRAM_STATE_VAR:
+       case PROGRAM_NAMED_PARAM:
+       case PROGRAM_CONSTANT:
+               return program->Parameters->ParameterValues[srcreg.Index];
+       default:
+               _mesa_problem(ctx, "get_fragmentprogram_constant: Unknown\n");
+               return dummy;
+       }
+}
+
+
+static void r300SetupPixelShader(r300ContextPtr rmesa)
+{
+       GLcontext *ctx = rmesa->radeon.glCtx;
+       struct r300_fragment_program *fp = (struct r300_fragment_program *)
+           (char *)ctx->FragmentProgram._Current;
+       struct r300_fragment_program_code *code;
+       int i, k;
+
+       if (!fp)                /* should only happenen once, just after context is created */
+               return;
+
+       r300TranslateFragmentShader(rmesa, fp);
+       if (!fp->translated) {
+               fprintf(stderr, "%s: No valid fragment shader, exiting\n",
+                       __FUNCTION__);
+               return;
+       }
+       code = &fp->code;
+
+       r300SetupTextures(ctx);
+
+       R300_STATECHANGE(rmesa, fpi[0]);
+       R300_STATECHANGE(rmesa, fpi[1]);
+       R300_STATECHANGE(rmesa, fpi[2]);
+       R300_STATECHANGE(rmesa, fpi[3]);
+       rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_RGB_INST_0, code->alu.length);
+       rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_RGB_ADDR_0, code->alu.length);
+       rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_ALPHA_INST_0, code->alu.length);
+       rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
+       for (i = 0; i < code->alu.length; i++) {
+               rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0;
+               rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1;
+               rmesa->hw.fpi[2].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst2;
+               rmesa->hw.fpi[3].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst3;
+       }
+
+       R300_STATECHANGE(rmesa, fp);
+       rmesa->hw.fp.cmd[R300_FP_CNTL0] = code->cur_node | (code->first_node_has_tex << 3);
+       rmesa->hw.fp.cmd[R300_FP_CNTL1] = code->max_temp_idx;
+       rmesa->hw.fp.cmd[R300_FP_CNTL2] =
+         (0 << R300_PFS_CNTL_ALU_OFFSET_SHIFT) |
+         ((code->alu.length-1) << R300_PFS_CNTL_ALU_END_SHIFT) |
+         (0 << R300_PFS_CNTL_TEX_OFFSET_SHIFT) |
+         ((code->tex.length ? code->tex.length-1 : 0) << R300_PFS_CNTL_TEX_END_SHIFT);
+       /* I just want to say, the way these nodes are stored.. weird.. */
+       for (i = 0, k = (4 - (code->cur_node + 1)); i < 4; i++, k++) {
+               if (i < (code->cur_node + 1)) {
+                       rmesa->hw.fp.cmd[R300_FP_NODE0 + k] =
+                         (code->node[i].alu_offset << R300_ALU_START_SHIFT) |
+                         (code->node[i].alu_end << R300_ALU_SIZE_SHIFT) |
+                         (code->node[i].tex_offset << R300_TEX_START_SHIFT) |
+                         (code->node[i].tex_end << R300_TEX_SIZE_SHIFT) |
+                         code->node[i].flags;
+               } else {
+                       rmesa->hw.fp.cmd[R300_FP_NODE0 + (3 - i)] = 0;
+               }
+       }
+
+       R300_STATECHANGE(rmesa, fpp);
+       rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_PFS_PARAM_0_X, code->const_nr * 4);
+       for (i = 0; i < code->const_nr; i++) {
+               const GLfloat *constant = get_fragmentprogram_constant(ctx,
+                       &fp->mesa_program.Base, code->constant[i]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(constant[0]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(constant[1]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(constant[2]);
+               rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat24(constant[3]);
+       }
+}
+
+#define bump_r500fp_count(ptr, new_count)   do{\
+       drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
+       int _nc=(new_count)/6; \
+       assert(_nc < 256); \
+       if(_nc>_p->r500fp.count)_p->r500fp.count=_nc;\
+} while(0)
+
+#define bump_r500fp_const_count(ptr, new_count)   do{\
+       drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
+       int _nc=(new_count)/4; \
+       assert(_nc < 256); \
+       if(_nc>_p->r500fp.count)_p->r500fp.count=_nc;\
+} while(0)
+
+static void r500SetupPixelShader(r300ContextPtr rmesa)
+{
+       GLcontext *ctx = rmesa->radeon.glCtx;
+       struct r500_fragment_program *fp = (struct r500_fragment_program *)
+           (char *)ctx->FragmentProgram._Current;
+       int i;
+       struct r500_fragment_program_code *code;
+
+       if (!fp)                /* should only happenen once, just after context is created */
+               return;
+
+       ((drm_r300_cmd_header_t *) rmesa->hw.r500fp.cmd)->r500fp.count = 0;
+       ((drm_r300_cmd_header_t *) rmesa->hw.r500fp_const.cmd)->r500fp.count = 0;
+
+       r500TranslateFragmentShader(rmesa, fp);
+       if (!fp->translated) {
+               fprintf(stderr, "%s: No valid fragment shader, exiting\n",
+                       __FUNCTION__);
+               return;
+       }
+       code = &fp->code;
+
+       r300SetupTextures(ctx);
+
+       R300_STATECHANGE(rmesa, fp);
+       rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = code->max_temp_idx;
+
+       rmesa->hw.fp.cmd[R500_FP_CODE_ADDR] =
+           R500_US_CODE_START_ADDR(code->inst_offset) |
+           R500_US_CODE_END_ADDR(code->inst_end);
+       rmesa->hw.fp.cmd[R500_FP_CODE_RANGE] =
+           R500_US_CODE_RANGE_ADDR(code->inst_offset) |
+           R500_US_CODE_RANGE_SIZE(code->inst_end);
+       rmesa->hw.fp.cmd[R500_FP_CODE_OFFSET] =
+           R500_US_CODE_OFFSET_ADDR(0); /* FIXME when we add flow control */
+
+       R300_STATECHANGE(rmesa, r500fp);
+       /* Emit our shader... */
+       for (i = 0; i < code->inst_end+1; i++) {
+               rmesa->hw.r500fp.cmd[i*6+1] = code->inst[i].inst0;
+               rmesa->hw.r500fp.cmd[i*6+2] = code->inst[i].inst1;
+               rmesa->hw.r500fp.cmd[i*6+3] = code->inst[i].inst2;
+               rmesa->hw.r500fp.cmd[i*6+4] = code->inst[i].inst3;
+               rmesa->hw.r500fp.cmd[i*6+5] = code->inst[i].inst4;
+               rmesa->hw.r500fp.cmd[i*6+6] = code->inst[i].inst5;
+       }
+
+       bump_r500fp_count(rmesa->hw.r500fp.cmd, (code->inst_end + 1) * 6);
+
+       R300_STATECHANGE(rmesa, r500fp_const);
+       for (i = 0; i < code->const_nr; i++) {
+               const GLfloat *constant = get_fragmentprogram_constant(ctx,
+                       &fp->mesa_program.Base, code->constant[i]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(constant[0]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(constant[1]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(constant[2]);
+               rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(constant[3]);
+       }
+       bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4);
+
+}
+
+void r300UpdateShaderStates(r300ContextPtr rmesa)
+{
+       GLcontext *ctx;
+       ctx = rmesa->radeon.glCtx;
+
+       r300SetEarlyZState(ctx);
+
+       /* w_fmt value is set to get best performance
+        * see p.130 R5xx 3D acceleration guide v1.3 */
+       GLuint w_fmt, fgdepthsrc;
+       if (current_fragment_program_writes_depth(ctx)) {
+               fgdepthsrc = R300_FG_DEPTH_SRC_SHADER;
+               w_fmt = R300_W_FMT_W24 | R300_W_SRC_US;
+       } else {
+               fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
+               w_fmt = R300_W_FMT_W0 | R300_W_SRC_US;
+       }
+
+       if (w_fmt != rmesa->hw.us_out_fmt.cmd[5]) {
+               R300_STATECHANGE(rmesa, us_out_fmt);
+               rmesa->hw.us_out_fmt.cmd[5] = w_fmt;
+       }
+
+       if (fgdepthsrc != rmesa->hw.fg_depth_src.cmd[1]) {
+               R300_STATECHANGE(rmesa, fg_depth_src);
+               rmesa->hw.fg_depth_src.cmd[1] = fgdepthsrc;
+       }
+
+       if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+               r500SetupPixelShader(rmesa);
+       else
+               r300SetupPixelShader(rmesa);
+
+       if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+               r500SetupRSUnit(ctx);
+       else
+               r300SetupRSUnit(ctx);
+
+       if ((rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+               r300SetupVertexProgram(rmesa);
+
+}
+
+/**
+ * Called by Mesa after an internal state update.
+ */
+static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
+{
+       r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+       _swrast_InvalidateState(ctx, new_state);
+       _swsetup_InvalidateState(ctx, new_state);
+       _vbo_InvalidateState(ctx, new_state);
+       _tnl_InvalidateState(ctx, new_state);
+       _ae_invalidate_state(ctx, new_state);
+
+       if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+               _mesa_update_framebuffer(ctx);
+               /* this updates the DrawBuffer's Width/Height if it's a FBO */
+               _mesa_update_draw_buffer_bounds(ctx);
+
+               R300_STATECHANGE(r300, cb);
+       }
+
+       r300UpdateStateParameters(ctx, new_state);
+
+       r300->radeon.NewGLState |= new_state;
+}
+
+/**
+ * Calculate initial hardware state and register state functions.
+ * Assumes that the command buffer and state atoms have been
+ * initialized already.
+ */
+void r300InitState(r300ContextPtr r300)
+{
+       memset(&(r300->state.texture), 0, sizeof(r300->state.texture));
+
+       r300ResetHwState(r300);
+}
+
+static void r300RenderMode(GLcontext * ctx, GLenum mode)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       (void)rmesa;
+       (void)mode;
+}
+
+void r300UpdateClipPlanes( GLcontext *ctx )
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       GLuint p;
+
+       for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
+               if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
+                       GLint *ip = (GLint *)ctx->Transform._ClipUserPlane[p];
+
+                       R300_STATECHANGE( rmesa, vpucp[p] );
+                       rmesa->hw.vpucp[p].cmd[R300_VPUCP_X] = ip[0];
+                       rmesa->hw.vpucp[p].cmd[R300_VPUCP_Y] = ip[1];
+                       rmesa->hw.vpucp[p].cmd[R300_VPUCP_Z] = ip[2];
+                       rmesa->hw.vpucp[p].cmd[R300_VPUCP_W] = ip[3];
+               }
+       }
+}
+
+/**
+ * Initialize driver's state callback functions
+ */
+void r300InitStateFuncs(struct dd_function_table *functions)
+{
+
+       functions->UpdateState = r300InvalidateState;
+       functions->AlphaFunc = r300AlphaFunc;
+       functions->BlendColor = r300BlendColor;
+       functions->BlendEquationSeparate = r300BlendEquationSeparate;
+       functions->BlendFuncSeparate = r300BlendFuncSeparate;
+       functions->Enable = r300Enable;
+       functions->ColorMask = r300ColorMask;
+       functions->DepthFunc = r300DepthFunc;
+       functions->DepthMask = r300DepthMask;
+       functions->CullFace = r300CullFace;
+       functions->FrontFace = r300FrontFace;
+       functions->ShadeModel = r300ShadeModel;
+       functions->LogicOpcode = r300LogicOpcode;
+
+       /* ARB_point_parameters */
+       functions->PointParameterfv = r300PointParameter;
+
+       /* Stencil related */
+       functions->StencilFuncSeparate = r300StencilFuncSeparate;
+       functions->StencilMaskSeparate = r300StencilMaskSeparate;
+       functions->StencilOpSeparate = r300StencilOpSeparate;
+
+       /* Viewport related */
+       functions->Viewport = r300Viewport;
+       functions->DepthRange = r300DepthRange;
+       functions->PointSize = r300PointSize;
+       functions->LineWidth = r300LineWidth;
+
+       functions->PolygonOffset = r300PolygonOffset;
+       functions->PolygonMode = r300PolygonMode;
+
+       functions->RenderMode = r300RenderMode;
+
+       functions->ClipPlane = r300ClipPlane;
+       functions->Scissor = radeonScissor;
+
+       functions->DrawBuffer           = radeonDrawBuffer;
+       functions->ReadBuffer           = radeonReadBuffer;
+}
diff --git a/src/mesa/drivers/dri/r600/r600_state.h b/src/mesa/drivers/dri/r600/r600_state.h
new file mode 100644 (file)
index 0000000..bc33da9
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __R600_STATE_H__
+#define __R600_STATE_H__
+
+#include "r600_context.h"
+
+#define R300_NEWPRIM( rmesa )                  \
+  do {                                         \
+  if ( rmesa->radeon.dma.flush )                       \
+    rmesa->radeon.dma.flush( rmesa->radeon.glCtx );    \
+  } while (0)
+
+#define R300_STATECHANGE(r300, atom) \
+       do {                                            \
+         R300_NEWPRIM(r300);                           \
+               r300->hw.atom.dirty = GL_TRUE;          \
+               r300->radeon.hw.is_dirty = GL_TRUE;             \
+       } while(0)
+
+// r300_state.c
+extern int future_hw_tcl_on;
+void _tnl_UpdateFixedFunctionProgram (GLcontext * ctx);
+void r300UpdateViewportOffset (GLcontext * ctx);
+void r300UpdateDrawBuffer (GLcontext * ctx);
+void r300UpdateStateParameters (GLcontext * ctx, GLuint new_state);
+void r300UpdateShaders (r300ContextPtr rmesa);
+void r300UpdateShaderStates (r300ContextPtr rmesa);
+void r300InitState (r300ContextPtr r300);
+void r300UpdateClipPlanes (GLcontext * ctx);
+void r300InitStateFuncs (struct dd_function_table *functions);
+
+#endif                         /* __R300_STATE_H__ */
diff --git a/src/mesa/drivers/dri/r600/r600_swtcl.c b/src/mesa/drivers/dri/r600/r600_swtcl.c
new file mode 100644 (file)
index 0000000..9c8b624
--- /dev/null
@@ -0,0 +1,722 @@
+/**************************************************************************
+
+Copyright (C) 2007 Dave Airlie
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Dave Airlie <airlied@linux.ie>
+ *   Maciej Cencora <m.cencora@gmail.com>
+ */
+
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+
+#include "r600_swtcl.h"
+#include "r600_emit.h"
+#include "r600_tex.h"
+
+#define EMIT_ATTR( ATTR, STYLE )                                       \
+do {                                                                   \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);    \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);   \
+   rmesa->radeon.swtcl.vertex_attr_count++;                                    \
+} while (0)
+
+#define EMIT_PAD( N )                                                  \
+do {                                                                   \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;         \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;  \
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);               \
+   rmesa->radeon.swtcl.vertex_attr_count++;                                    \
+} while (0)
+
+#define ADD_ATTR(_attr, _format, _dst_loc, _swizzle, _write_mask) \
+do { \
+       attrs[num_attrs].attr = (_attr); \
+       attrs[num_attrs].format = (_format); \
+       attrs[num_attrs].dst_loc = (_dst_loc); \
+       attrs[num_attrs].swizzle = (_swizzle); \
+       attrs[num_attrs].write_mask = (_write_mask); \
+       ++num_attrs; \
+} while (0)
+
+static void r300SwtclVAPSetup(GLcontext *ctx, GLuint InputsRead, GLuint OutputsWritten)
+{
+       r300ContextPtr rmesa = R300_CONTEXT( ctx );
+       TNLcontext *tnl = TNL_CONTEXT(ctx);
+       struct vertex_buffer *VB = &tnl->vb;
+       struct vertex_attribute *attrs = rmesa->swtcl.vert_attrs;
+       int vte = 0;
+       int i, j, reg_count;
+       uint32_t *vir0 = &rmesa->hw.vir[0].cmd[1];
+       uint32_t *vir1 = &rmesa->hw.vir[1].cmd[1];
+
+       for (i = 0; i < R300_VIR_CMDSIZE-1; ++i)
+               vir0[i] = vir1[i] = 0;
+
+       for (i = 0, j = 0; i < rmesa->radeon.swtcl.vertex_attr_count; ++i) {
+               int tmp, data_format;
+               switch (attrs[i].format) {
+                       case EMIT_1F:
+                               data_format = R300_DATA_TYPE_FLOAT_1;
+                               break;
+                       case EMIT_2F:
+                               data_format = R300_DATA_TYPE_FLOAT_2;
+                               break;
+                       case EMIT_3F:
+                               data_format = R300_DATA_TYPE_FLOAT_3;
+                               break;
+                       case EMIT_4F:
+                               data_format = R300_DATA_TYPE_FLOAT_4;
+                               break;
+                       case EMIT_4UB_4F_RGBA:
+                       case EMIT_4UB_4F_ABGR:
+                               data_format = R300_DATA_TYPE_BYTE | R300_NORMALIZE;
+                               break;
+                       default:
+                               fprintf(stderr, "%s: Invalid data format type", __FUNCTION__);
+                               _mesa_exit(-1);
+                               break;
+               }
+
+               tmp = data_format | (attrs[i].dst_loc << R300_DST_VEC_LOC_SHIFT);
+               if (i % 2 == 0) {
+                       vir0[j] = tmp << R300_DATA_TYPE_0_SHIFT;
+                       vir1[j] = attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT);
+               } else {
+                       vir0[j] |= tmp << R300_DATA_TYPE_1_SHIFT;
+                       vir1[j] |= (attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
+                       ++j;
+               }
+       }
+
+       reg_count = (rmesa->radeon.swtcl.vertex_attr_count + 1) >> 1;
+       if (rmesa->radeon.swtcl.vertex_attr_count % 2 != 0) {
+               vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
+       } else {
+               vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
+       }
+
+       R300_STATECHANGE(rmesa, vir[0]);
+       R300_STATECHANGE(rmesa, vir[1]);
+       R300_STATECHANGE(rmesa, vof);
+       R300_STATECHANGE(rmesa, vte);
+       R300_STATECHANGE(rmesa, vic);
+
+       if (rmesa->radeon.radeonScreen->kernel_mm) {
+               rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
+               rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
+               rmesa->hw.vir[0].cmd[0] |= (reg_count & 0x3FFF) << 16;
+               rmesa->hw.vir[1].cmd[0] |= (reg_count & 0x3FFF) << 16;
+       } else {
+               ((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count = reg_count;
+               ((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count = reg_count;
+       }
+
+       rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
+       rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
+       rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten);
+       rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = r300VAPOutputCntl1(ctx, OutputsWritten);
+
+       vte = rmesa->hw.vte.cmd[1];
+       vte &= ~(R300_VTX_XY_FMT | R300_VTX_Z_FMT | R300_VTX_W0_FMT);
+       /* Important:
+        */
+       if ( VB->NdcPtr != NULL ) {
+               VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
+               vte |= R300_VTX_XY_FMT | R300_VTX_Z_FMT;
+       }
+       else {
+               VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
+               vte |= R300_VTX_W0_FMT;
+       }
+
+       assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
+
+       rmesa->hw.vte.cmd[1] = vte;
+       rmesa->hw.vte.cmd[2] = rmesa->radeon.swtcl.vertex_size;
+}
+
+
+static void r300SetVertexFormat( GLcontext *ctx )
+{
+       r300ContextPtr rmesa = R300_CONTEXT( ctx );
+       TNLcontext *tnl = TNL_CONTEXT(ctx);
+       struct vertex_buffer *VB = &tnl->vb;
+       int fog_id = -1;
+       GLuint InputsRead = 0;
+       GLuint OutputsWritten = 0;
+       int num_attrs = 0;
+       struct vertex_attribute *attrs = rmesa->swtcl.vert_attrs;
+
+       rmesa->swtcl.coloroffset = rmesa->swtcl.specoffset = 0;
+       rmesa->radeon.swtcl.vertex_attr_count = 0;
+
+       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_POS)) {
+               InputsRead |= 1 << VERT_ATTRIB_POS;
+               OutputsWritten |= 1 << VERT_RESULT_HPOS;
+               EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F );
+               ADD_ATTR(VERT_ATTRIB_POS, EMIT_4F, SWTCL_OVM_POS, SWIZZLE_XYZW, MASK_XYZW);
+               rmesa->swtcl.coloroffset = 4;
+       }
+
+       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR0)) {
+               InputsRead |= 1 << VERT_ATTRIB_COLOR0;
+               OutputsWritten |= 1 << VERT_RESULT_COL0;
+#if MESA_LITTLE_ENDIAN
+               EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA );
+               ADD_ATTR(VERT_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW);
+#else
+               EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR );
+               ADD_ATTR(VERT_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW);
+#endif
+       }
+
+       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_COLOR1 )) {
+               GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+               InputsRead |= 1 << VERT_ATTRIB_COLOR1;
+               OutputsWritten |= 1 << VERT_RESULT_COL1;
+#if MESA_LITTLE_ENDIAN
+               EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_RGBA );
+               ADD_ATTR(VERT_ATTRIB_COLOR1, EMIT_4UB_4F_RGBA, SWTCL_OVM_COLOR1, swiz, MASK_XYZW);
+#else
+               EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_ABGR );
+               ADD_ATTR(VERT_ATTRIB_COLOR1, EMIT_4UB_4F_ABGR, SWTCL_OVM_COLOR1, swiz, MASK_XYZW);
+#endif
+               rmesa->swtcl.specoffset = rmesa->swtcl.coloroffset + 1;
+       }
+
+       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_POINTSIZE )) {
+               GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO);
+               InputsRead |= 1 << VERT_ATTRIB_POINT_SIZE;
+               OutputsWritten |= 1 << VERT_RESULT_PSIZ;
+               EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F );
+               ADD_ATTR(VERT_ATTRIB_POINT_SIZE, EMIT_1F, SWTCL_OVM_POINT_SIZE, swiz, MASK_X);
+       }
+
+       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_FOG)) {
+               /* find first free tex coord slot */
+               if (RENDERINPUTS_TEST_RANGE(tnl->render_inputs_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
+                       int i;
+                       for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+                               if (!RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX(i) )) {
+                                       fog_id = i;
+                                       break;
+                               }
+                       }
+               } else {
+                       fog_id = 0;
+               }
+
+               if (fog_id == -1) {
+                       fprintf(stderr, "\tout of free texcoords to do fog\n");
+                       _mesa_exit(-1);
+               }
+
+               InputsRead |= 1 << VERT_ATTRIB_FOG;
+               OutputsWritten |= 1 << VERT_RESULT_FOGC;
+               GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO);
+               EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1F );
+               ADD_ATTR(VERT_ATTRIB_FOG, EMIT_1F, SWTCL_OVM_TEX(fog_id), swiz, MASK_X);
+       }
+
+       if (RENDERINPUTS_TEST_RANGE(tnl->render_inputs_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
+               int i;
+               GLuint swiz, mask, format;
+               for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+                       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX(i) )) {
+                               switch (VB->TexCoordPtr[i]->size) {
+                                       case 1:
+                                       case 2:
+                                               format = EMIT_2F;
+                                               swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ZERO);
+                                               mask = MASK_X | MASK_Y;
+                                               break;
+                                       case 3:
+                                               format = EMIT_3F;
+                                               swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+                                               mask = MASK_X | MASK_Y | MASK_Z;
+                                               break;
+                                       case 4:
+                                               format = EMIT_4F;
+                                               swiz = SWIZZLE_XYZW;
+                                               mask = MASK_XYZW;
+                                               break;
+                                       default:
+                                               continue;
+                               }
+                               InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
+                               OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
+                               EMIT_ATTR(_TNL_ATTRIB_TEX(i), format);
+                               ADD_ATTR(VERT_ATTRIB_TEX0 + i, format, SWTCL_OVM_TEX(i), swiz, mask);
+                       }
+               }
+       }
+
+       /* RS can't put fragment position on the pixel stack, so stuff it in texcoord if needed */
+       if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_POS) && (ctx->FragmentProgram._Current->Base.InputsRead & FRAG_BIT_WPOS)) {
+               int first_free_tex = -1;
+               if (fog_id >= 0) {
+                       first_free_tex = fog_id+1;
+               } else {
+                       if (RENDERINPUTS_TEST_RANGE(tnl->render_inputs_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
+                               int i;
+                               for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+                                       if (!RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX(i) )) {
+                                               first_free_tex = i;
+                                               break;
+                                       }
+                               }
+                       } else {
+                               first_free_tex = 0;
+                       }
+               }
+
+               if (first_free_tex == -1) {
+                       fprintf(stderr, "\tout of free texcoords to write w pos\n");
+                       _mesa_exit(-1);
+               }
+
+               InputsRead |= 1 << (VERT_ATTRIB_TEX0 + first_free_tex);
+               OutputsWritten |= 1 << (VERT_RESULT_TEX0 + first_free_tex);
+               EMIT_ATTR( _TNL_ATTRIB_TEX(first_free_tex), EMIT_4F );
+               ADD_ATTR(VERT_ATTRIB_TEX0 + first_free_tex, EMIT_4F, SWTCL_OVM_TEX(first_free_tex), SWIZZLE_XYZW, MASK_XYZW);
+       }
+
+       R300_NEWPRIM(rmesa);
+       r300SwtclVAPSetup(ctx, InputsRead, OutputsWritten);
+
+       rmesa->radeon.swtcl.vertex_size =
+               _tnl_install_attrs( ctx,
+                                   rmesa->radeon.swtcl.vertex_attrs,
+                                   rmesa->radeon.swtcl.vertex_attr_count,
+                                   NULL, 0 );
+
+       rmesa->radeon.swtcl.vertex_size /= 4;
+
+       RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset, tnl->render_inputs_bitset);
+}
+
+
+static GLuint reduced_prim[] = {
+       GL_POINTS,
+       GL_LINES,
+       GL_LINES,
+       GL_LINES,
+       GL_TRIANGLES,
+       GL_TRIANGLES,
+       GL_TRIANGLES,
+       GL_TRIANGLES,
+       GL_TRIANGLES,
+       GL_TRIANGLES,
+};
+
+static void r300RasterPrimitive( GLcontext *ctx, GLuint prim );
+static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
+
+/***********************************************************************
+ *                    Emit primitives as inline vertices               *
+ ***********************************************************************/
+
+
+#define HAVE_POINTS      1
+#define HAVE_LINES       1
+#define HAVE_LINE_STRIPS 1
+#define HAVE_TRIANGLES   1
+#define HAVE_TRI_STRIPS  1
+#define HAVE_TRI_STRIP_1 0
+#define HAVE_TRI_FANS    1
+#define HAVE_QUADS       0
+#define HAVE_QUAD_STRIPS 0
+#define HAVE_POLYGONS    1
+#define HAVE_ELTS        1
+
+#undef LOCAL_VARS
+#undef ALLOC_VERTS
+#define CTX_ARG r300ContextPtr rmesa
+#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) rcommonAllocDmaLowVerts( &rmesa->radeon, n, size * 4 )
+#define LOCAL_VARS                                             \
+   r300ContextPtr rmesa = R300_CONTEXT(ctx);           \
+   const char *r300verts = (char *)rmesa->radeon.swtcl.verts;
+#define VERT(x) (r300Vertex *)(r300verts + ((x) * vertsize * sizeof(int)))
+#define VERTEX r300Vertex
+#undef TAG
+#define TAG(x) r300_##x
+#include "tnl_dd/t_dd_triemit.h"
+
+
+
+/***********************************************************************
+ *          Macros for t_dd_tritmp.h to draw basic primitives          *
+ ***********************************************************************/
+
+#define QUAD( a, b, c, d ) r300_quad( rmesa, a, b, c, d )
+#define TRI( a, b, c )     r300_triangle( rmesa, a, b, c )
+#define LINE( a, b )       r300_line( rmesa, a, b )
+#define POINT( a )         r300_point( rmesa, a )
+
+/***********************************************************************
+ *              Build render functions from dd templates               *
+ ***********************************************************************/
+
+#define R300_TWOSIDE_BIT       0x01
+#define R300_UNFILLED_BIT      0x02
+#define R300_MAX_TRIFUNC       0x04
+
+static struct {
+   tnl_points_func             points;
+   tnl_line_func               line;
+   tnl_triangle_func   triangle;
+   tnl_quad_func               quad;
+} rast_tab[R300_MAX_TRIFUNC];
+
+#define DO_FALLBACK  0
+#define DO_UNFILLED (IND & R300_UNFILLED_BIT)
+#define DO_TWOSIDE  (IND & R300_TWOSIDE_BIT)
+#define DO_FLAT      0
+#define DO_OFFSET     0
+#define DO_TRI       1
+#define DO_QUAD      1
+#define DO_LINE      1
+#define DO_POINTS    1
+#define DO_FULL_QUAD 1
+
+#define HAVE_RGBA   1
+#define HAVE_SPEC   1
+#define HAVE_BACK_COLORS  0
+#define HAVE_HW_FLATSHADE 1
+#define TAB rast_tab
+
+#define DEPTH_SCALE 1.0
+#define UNFILLED_TRI unfilled_tri
+#define UNFILLED_QUAD unfilled_quad
+#define VERT_X(_v) _v->v.x
+#define VERT_Y(_v) _v->v.y
+#define VERT_Z(_v) _v->v.z
+#define AREA_IS_CCW( a ) (a < 0)
+#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + (e*rmesa->radeon.swtcl.vertex_size*sizeof(int)))
+
+#define VERT_SET_RGBA( v, c ) \
+do { \
+   r300_color_t *color = (r300_color_t *)&((v)->ui[coloroffset]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->alpha, (c)[3]); \
+} while (0)
+
+#define VERT_COPY_RGBA( v0, v1 ) v0->ui[coloroffset] = v1->ui[coloroffset]
+
+#define VERT_SET_SPEC( v0, c ) \
+do { \
+   if (specoffset) { \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.red, (c)[0]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.green, (c)[1]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.blue, (c)[2]); \
+   } \
+} while (0)
+
+#define VERT_COPY_SPEC( v0, v1 ) \
+do { \
+   if (specoffset) { \
+       v0->v.specular.red = v1->v.specular.red; \
+       v0->v.specular.green = v1->v.specular.green; \
+       v0->v.specular.blue = v1->v.specular.blue; \
+   } \
+} while (0)
+
+#define VERT_SAVE_RGBA( idx )    color[idx] = v[idx]->ui[coloroffset]
+#define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = color[idx]
+#define VERT_SAVE_SPEC( idx )    if (specoffset) spec[idx] = v[idx]->ui[specoffset]
+#define VERT_RESTORE_SPEC( idx ) if (specoffset) v[idx]->ui[specoffset] = spec[idx]
+
+#undef LOCAL_VARS
+#undef TAG
+#undef INIT
+
+#define LOCAL_VARS(n)                                                  \
+   r300ContextPtr rmesa = R300_CONTEXT(ctx);                   \
+   GLuint color[n] = { 0, }, spec[n] = { 0, };                                         \
+   GLuint coloroffset = rmesa->swtcl.coloroffset;      \
+   GLuint specoffset = rmesa->swtcl.specoffset;                        \
+   (void) color; (void) spec; (void) coloroffset; (void) specoffset;
+
+/***********************************************************************
+ *                Helpers for rendering unfilled primitives            *
+ ***********************************************************************/
+
+#define RASTERIZE(x) r300RasterPrimitive( ctx, reduced_prim[x] )
+#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
+#undef TAG
+#define TAG(x) x
+#include "tnl_dd/t_dd_unfilled.h"
+#undef IND
+
+
+/***********************************************************************
+ *                      Generate GL render functions                   *
+ ***********************************************************************/
+
+
+#define IND (0)
+#define TAG(x) x
+#include "tnl_dd/t_dd_tritmp.h"
+
+#define IND (R300_TWOSIDE_BIT)
+#define TAG(x) x##_twoside
+#include "tnl_dd/t_dd_tritmp.h"
+
+#define IND (R300_UNFILLED_BIT)
+#define TAG(x) x##_unfilled
+#include "tnl_dd/t_dd_tritmp.h"
+
+#define IND (R300_TWOSIDE_BIT|R300_UNFILLED_BIT)
+#define TAG(x) x##_twoside_unfilled
+#include "tnl_dd/t_dd_tritmp.h"
+
+
+
+static void init_rast_tab( void )
+{
+   init();
+   init_twoside();
+   init_unfilled();
+   init_twoside_unfilled();
+}
+
+/**********************************************************************/
+/*               Render unclipped begin/end objects                   */
+/**********************************************************************/
+
+#define RENDER_POINTS( start, count )          \
+   for ( ; start < count ; start++)            \
+      r300_point( rmesa, VERT(start) )
+#define RENDER_LINE( v0, v1 ) \
+   r300_line( rmesa, VERT(v0), VERT(v1) )
+#define RENDER_TRI( v0, v1, v2 )  \
+   r300_triangle( rmesa, VERT(v0), VERT(v1), VERT(v2) )
+#define RENDER_QUAD( v0, v1, v2, v3 ) \
+   r300_quad( rmesa, VERT(v0), VERT(v1), VERT(v2), VERT(v3) )
+#define INIT(x) do {                                   \
+   r300RenderPrimitive( ctx, x );                      \
+} while (0)
+#undef LOCAL_VARS
+#define LOCAL_VARS                                             \
+   r300ContextPtr rmesa = R300_CONTEXT(ctx);           \
+   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;            \
+   const char *r300verts = (char *)rmesa->radeon.swtcl.verts;          \
+   const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;       \
+   const GLboolean stipple = ctx->Line.StippleFlag;            \
+   (void) elt; (void) stipple;
+#define RESET_STIPPLE  //if ( stipple ) r200ResetLineStipple( ctx );
+#define RESET_OCCLUSION
+#define PRESERVE_VB_DEFS
+#define ELT(x) (x)
+#define TAG(x) r300_##x##_verts
+#include "tnl/t_vb_rendertmp.h"
+#undef ELT
+#undef TAG
+#define TAG(x) r300_##x##_elts
+#define ELT(x) elt[x]
+#include "tnl/t_vb_rendertmp.h"
+
+
+
+
+/**********************************************************************/
+/*                    Choose render functions                         */
+/**********************************************************************/
+static void r300ChooseRenderState( GLcontext *ctx )
+{
+       TNLcontext *tnl = TNL_CONTEXT(ctx);
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       GLuint index = 0;
+       GLuint flags = ctx->_TriangleCaps;
+
+       if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R300_TWOSIDE_BIT;
+       if (flags & DD_TRI_UNFILLED)      index |= R300_UNFILLED_BIT;
+
+       if (index != rmesa->radeon.swtcl.RenderIndex) {
+               tnl->Driver.Render.Points = rast_tab[index].points;
+               tnl->Driver.Render.Line = rast_tab[index].line;
+               tnl->Driver.Render.ClippedLine = rast_tab[index].line;
+               tnl->Driver.Render.Triangle = rast_tab[index].triangle;
+               tnl->Driver.Render.Quad = rast_tab[index].quad;
+
+               if (index == 0) {
+                       tnl->Driver.Render.PrimTabVerts = r300_render_tab_verts;
+                       tnl->Driver.Render.PrimTabElts = r300_render_tab_elts;
+                       tnl->Driver.Render.ClippedPolygon = r300_fast_clipped_poly;
+               } else {
+                       tnl->Driver.Render.PrimTabVerts = _tnl_render_tab_verts;
+                       tnl->Driver.Render.PrimTabElts = _tnl_render_tab_elts;
+                       tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
+               }
+
+               rmesa->radeon.swtcl.RenderIndex = index;
+       }
+}
+
+
+static void r300RenderStart(GLcontext *ctx)
+{
+       r300ContextPtr rmesa = R300_CONTEXT( ctx );
+
+       r300ChooseRenderState(ctx);
+       r300SetVertexFormat(ctx);
+
+       r300ValidateBuffers(ctx);
+
+       r300UpdateShaders(rmesa);
+       r300UpdateShaderStates(rmesa);
+
+       r300EmitCacheFlush(rmesa);
+
+       /* investigate if we can put back flush optimisation if needed */
+       if (rmesa->radeon.dma.flush != NULL) {
+               rmesa->radeon.dma.flush(ctx);
+       }
+}
+
+static void r300RenderFinish(GLcontext *ctx)
+{
+}
+
+static void r300RasterPrimitive( GLcontext *ctx, GLuint hwprim )
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+       if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
+               R300_NEWPRIM( rmesa );
+               rmesa->radeon.swtcl.hw_primitive = hwprim;
+       }
+}
+
+static void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
+{
+
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       rmesa->radeon.swtcl.render_primitive = prim;
+
+       if ((prim == GL_TRIANGLES) && (ctx->_TriangleCaps & DD_TRI_UNFILLED))
+               return;
+
+       r300RasterPrimitive( ctx, reduced_prim[prim] );
+}
+
+static void r300ResetLineStipple(GLcontext *ctx)
+{
+}
+
+void r300InitSwtcl(GLcontext *ctx)
+{
+       TNLcontext *tnl = TNL_CONTEXT(ctx);
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       static int firsttime = 1;
+
+       if (firsttime) {
+               init_rast_tab();
+               firsttime = 0;
+       }
+
+       tnl->Driver.Render.Start = r300RenderStart;
+       tnl->Driver.Render.Finish = r300RenderFinish;
+       tnl->Driver.Render.PrimitiveNotify = r300RenderPrimitive;
+       tnl->Driver.Render.ResetLineStipple = r300ResetLineStipple;
+       tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+       tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+       tnl->Driver.Render.Interp = _tnl_interp;
+
+       /* FIXME: what are these numbers? */
+       _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12,
+                           48 * sizeof(GLfloat) );
+
+       rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+       rmesa->radeon.swtcl.RenderIndex = ~0;
+       rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
+       rmesa->radeon.swtcl.hw_primitive = 0;
+
+       _tnl_invalidate_vertex_state( ctx, ~0 );
+       _tnl_invalidate_vertices( ctx, ~0 );
+
+       _tnl_need_projected_coords( ctx, GL_FALSE );
+       r300ChooseRenderState(ctx);
+}
+
+void r300DestroySwtcl(GLcontext *ctx)
+{
+}
+
+static void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, struct radeon_bo *bo, GLuint offset)
+{
+       BATCH_LOCALS(&rmesa->radeon);
+
+       if (RADEON_DEBUG & DEBUG_VERTS)
+               fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
+                       __FUNCTION__, vertex_size, offset);
+
+       BEGIN_BATCH(7);
+       OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2);
+       OUT_BATCH(1);
+       OUT_BATCH(vertex_size | (vertex_size << 8));
+       OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+       END_BATCH();
+}
+
+static void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr)
+{
+       BATCH_LOCALS(&rmesa->radeon);
+       int type, num_verts;
+
+       type = r300PrimitiveType(rmesa, primitive);
+       num_verts = r300NumVerts(rmesa, vertex_nr, primitive);
+
+       BEGIN_BATCH(3);
+       OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+       OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);
+       END_BATCH();
+}
+
+void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+       rcommonEnsureCmdBufSpace(&rmesa->radeon,
+                          rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
+                          __FUNCTION__);
+       radeonEmitState(&rmesa->radeon);
+       r300EmitVertexAOS(rmesa,
+                       rmesa->radeon.swtcl.vertex_size,
+                       rmesa->radeon.dma.current,
+                       current_offset);
+
+       r300EmitVbufPrim(rmesa,
+                  rmesa->radeon.swtcl.hw_primitive,
+                  rmesa->radeon.swtcl.numverts);
+       r300EmitCacheFlush(rmesa);
+       COMMIT_BATCH();
+}
diff --git a/src/mesa/drivers/dri/r600/r600_swtcl.h b/src/mesa/drivers/dri/r600/r600_swtcl.h
new file mode 100644 (file)
index 0000000..d1739f8
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com> - original r200 code
+ *   Dave Airlie <airlied@linux.ie>
+ */
+
+#ifndef __R600_SWTCL_H__
+#define __R600_SWTCL_H__
+
+#include "main/mtypes.h"
+#include "swrast/swrast.h"
+#include "r600_context.h"
+
+#define MASK_XYZW (R300_WRITE_ENA_X | R300_WRITE_ENA_Y | R300_WRITE_ENA_Z | R300_WRITE_ENA_W)
+#define MASK_X R300_WRITE_ENA_X
+#define MASK_Y R300_WRITE_ENA_Y
+#define MASK_Z R300_WRITE_ENA_Z
+#define MASK_W R300_WRITE_ENA_W
+
+/*
+ * Here are definitions of OVM locations of vertex attributes for non TCL hw
+ */
+#define SWTCL_OVM_POS 0
+#define SWTCL_OVM_COLOR0 2
+#define SWTCL_OVM_COLOR1 3
+#define SWTCL_OVM_TEX(n) ((n) + 6)
+#define SWTCL_OVM_POINT_SIZE 15
+
+
+extern void r300InitSwtcl( GLcontext *ctx );
+extern void r300DestroySwtcl( GLcontext *ctx );
+
+extern void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
+#endif
diff --git a/src/mesa/drivers/dri/r600/r600_tex.c b/src/mesa/drivers/dri/r600/r600_tex.c
new file mode 100644 (file)
index 0000000..70bf6c4
--- /dev/null
@@ -0,0 +1,347 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/colormac.h"
+#include "main/context.h"
+#include "main/enums.h"
+#include "main/image.h"
+#include "main/mipmap.h"
+#include "main/simple_list.h"
+#include "main/texformat.h"
+#include "main/texstore.h"
+#include "main/teximage.h"
+#include "main/texobj.h"
+
+#include "texmem.h"
+
+#include "r600_context.h"
+#include "r600_state.h"
+#include "r600_ioctl.h"
+#include "radeon_mipmap_tree.h"
+#include "r600_tex.h"
+
+#include "xmlpool.h"
+
+
+static unsigned int translate_wrap_mode(GLenum wrapmode)
+{
+       switch(wrapmode) {
+       case GL_REPEAT: return R300_TX_REPEAT;
+       case GL_CLAMP: return R300_TX_CLAMP;
+       case GL_CLAMP_TO_EDGE: return R300_TX_CLAMP_TO_EDGE;
+       case GL_CLAMP_TO_BORDER: return R300_TX_CLAMP_TO_BORDER;
+       case GL_MIRRORED_REPEAT: return R300_TX_REPEAT | R300_TX_MIRRORED;
+       case GL_MIRROR_CLAMP_EXT: return R300_TX_CLAMP | R300_TX_MIRRORED;
+       case GL_MIRROR_CLAMP_TO_EDGE_EXT: return R300_TX_CLAMP_TO_EDGE | R300_TX_MIRRORED;
+       case GL_MIRROR_CLAMP_TO_BORDER_EXT: return R300_TX_CLAMP_TO_BORDER | R300_TX_MIRRORED;
+       default:
+               _mesa_problem(NULL, "bad wrap mode in %s", __FUNCTION__);
+               return 0;
+       }
+}
+
+
+/**
+ * Update the cached hardware registers based on the current texture wrap modes.
+ *
+ * \param t Texture object whose wrap modes are to be set
+ */
+static void r300UpdateTexWrap(radeonTexObjPtr t)
+{
+       struct gl_texture_object *tObj = &t->base;
+
+       t->pp_txfilter &=
+           ~(R300_TX_WRAP_S_MASK | R300_TX_WRAP_T_MASK | R300_TX_WRAP_R_MASK);
+
+       t->pp_txfilter |= translate_wrap_mode(tObj->WrapS) << R300_TX_WRAP_S_SHIFT;
+
+       if (tObj->Target != GL_TEXTURE_1D) {
+               t->pp_txfilter |= translate_wrap_mode(tObj->WrapT) << R300_TX_WRAP_T_SHIFT;
+
+               if (tObj->Target == GL_TEXTURE_3D)
+                       t->pp_txfilter |= translate_wrap_mode(tObj->WrapR) << R300_TX_WRAP_R_SHIFT;
+       }
+}
+
+static GLuint aniso_filter(GLfloat anisotropy)
+{
+       if (anisotropy >= 16.0) {
+               return R300_TX_MAX_ANISO_16_TO_1;
+       } else if (anisotropy >= 8.0) {
+               return R300_TX_MAX_ANISO_8_TO_1;
+       } else if (anisotropy >= 4.0) {
+               return R300_TX_MAX_ANISO_4_TO_1;
+       } else if (anisotropy >= 2.0) {
+               return R300_TX_MAX_ANISO_2_TO_1;
+       } else {
+               return R300_TX_MAX_ANISO_1_TO_1;
+       }
+}
+
+/**
+ * Set the texture magnification and minification modes.
+ *
+ * \param t Texture whose filter modes are to be set
+ * \param minf Texture minification mode
+ * \param magf Texture magnification mode
+ * \param anisotropy Maximum anisotropy level
+ */
+static void r300SetTexFilter(radeonTexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
+{
+       /* Force revalidation to account for switches from/to mipmapping. */
+       t->validated = GL_FALSE;
+
+       t->pp_txfilter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
+       t->pp_txfilter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
+
+       /* Note that EXT_texture_filter_anisotropic is extremely vague about
+        * how anisotropic filtering interacts with the "normal" filter modes.
+        * When anisotropic filtering is enabled, we override min and mag
+        * filter settings completely. This includes driconf's settings.
+        */
+       if (anisotropy >= 2.0 && (minf != GL_NEAREST) && (magf != GL_NEAREST)) {
+               t->pp_txfilter |= R300_TX_MAG_FILTER_ANISO
+                       | R300_TX_MIN_FILTER_ANISO
+                       | R300_TX_MIN_FILTER_MIP_LINEAR
+                       | aniso_filter(anisotropy);
+               if (RADEON_DEBUG & DEBUG_TEXTURE)
+                       fprintf(stderr, "Using maximum anisotropy of %f\n", anisotropy);
+               return;
+       }
+
+       switch (minf) {
+       case GL_NEAREST:
+               t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST;
+               break;
+       case GL_LINEAR:
+               t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR;
+               break;
+       case GL_NEAREST_MIPMAP_NEAREST:
+               t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_NEAREST;
+               break;
+       case GL_NEAREST_MIPMAP_LINEAR:
+               t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_LINEAR;
+               break;
+       case GL_LINEAR_MIPMAP_NEAREST:
+               t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_NEAREST;
+               break;
+       case GL_LINEAR_MIPMAP_LINEAR:
+               t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_LINEAR;
+               break;
+       }
+
+       /* Note we don't have 3D mipmaps so only use the mag filter setting
+        * to set the 3D texture filter mode.
+        */
+       switch (magf) {
+       case GL_NEAREST:
+               t->pp_txfilter |= R300_TX_MAG_FILTER_NEAREST;
+               break;
+       case GL_LINEAR:
+               t->pp_txfilter |= R300_TX_MAG_FILTER_LINEAR;
+               break;
+       }
+}
+
+static void r300SetTexBorderColor(radeonTexObjPtr t, GLubyte c[4])
+{
+       t->pp_border_color = PACK_COLOR_8888(c[3], c[0], c[1], c[2]);
+}
+
+/**
+ * Changes variables and flags for a state update, which will happen at the
+ * next UpdateTextureState
+ */
+
+static void r300TexParameter(GLcontext * ctx, GLenum target,
+                            struct gl_texture_object *texObj,
+                            GLenum pname, const GLfloat * params)
+{
+       radeonTexObj* t = radeon_tex_obj(texObj);
+
+       if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+               fprintf(stderr, "%s( %s )\n", __FUNCTION__,
+                       _mesa_lookup_enum_by_nr(pname));
+       }
+
+       switch (pname) {
+       case GL_TEXTURE_MIN_FILTER:
+       case GL_TEXTURE_MAG_FILTER:
+       case GL_TEXTURE_MAX_ANISOTROPY_EXT:
+               r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
+               break;
+
+       case GL_TEXTURE_WRAP_S:
+       case GL_TEXTURE_WRAP_T:
+       case GL_TEXTURE_WRAP_R:
+               r300UpdateTexWrap(t);
+               break;
+
+       case GL_TEXTURE_BORDER_COLOR:
+               r300SetTexBorderColor(t, texObj->_BorderChan);
+               break;
+
+       case GL_TEXTURE_BASE_LEVEL:
+       case GL_TEXTURE_MAX_LEVEL:
+       case GL_TEXTURE_MIN_LOD:
+       case GL_TEXTURE_MAX_LOD:
+               /* This isn't the most efficient solution but there doesn't appear to
+                * be a nice alternative.  Since there's no LOD clamping,
+                * we just have to rely on loading the right subset of mipmap levels
+                * to simulate a clamped LOD.
+                */
+               if (t->mt) {
+                       radeon_miptree_unreference(t->mt);
+                       t->mt = 0;
+                       t->validated = GL_FALSE;
+               }
+               break;
+
+       case GL_DEPTH_TEXTURE_MODE:
+               if (!texObj->Image[0][texObj->BaseLevel])
+                       return;
+               if (texObj->Image[0][texObj->BaseLevel]->TexFormat->BaseFormat
+                   == GL_DEPTH_COMPONENT) {
+                       r300SetDepthTexMode(texObj);
+                       break;
+               } else {
+                       /* If the texture isn't a depth texture, changing this
+                        * state won't cause any changes to the hardware.
+                        * Don't force a flush of texture state.
+                        */
+                       return;
+               }
+
+       default:
+               return;
+       }
+}
+
+static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       radeonTexObj* t = radeon_tex_obj(texObj);
+
+       if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+               fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+                       (void *)texObj,
+                       _mesa_lookup_enum_by_nr(texObj->Target));
+       }
+
+       if (rmesa) {
+               int i;
+               radeon_firevertices(&rmesa->radeon);
+
+               for(i = 0; i < R300_MAX_TEXTURE_UNITS; ++i)
+                       if (rmesa->hw.textures[i] == t)
+                               rmesa->hw.textures[i] = 0;
+       }
+
+       if (t->bo) {
+               radeon_bo_unref(t->bo);
+               t->bo = NULL;
+       }
+
+       if (t->mt) {
+               radeon_miptree_unreference(t->mt);
+               t->mt = 0;
+       }
+       _mesa_delete_texture_object(ctx, texObj);
+}
+
+/**
+ * Allocate a new texture object.
+ * Called via ctx->Driver.NewTextureObject.
+ * Note: this function will be called during context creation to
+ * allocate the default texture objects.
+ * Fixup MaxAnisotropy according to user preference.
+ */
+static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
+                                                     GLuint name,
+                                                     GLenum target)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
+
+
+       if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+               fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+                       t, _mesa_lookup_enum_by_nr(target));
+       }
+
+       _mesa_initialize_texture_object(&t->base, name, target);
+       t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
+
+       /* Initialize hardware state */
+       r300UpdateTexWrap(t);
+       r300SetTexFilter(t, t->base.MinFilter, t->base.MagFilter, t->base.MaxAnisotropy);
+       r300SetTexBorderColor(t, t->base._BorderChan);
+
+       return &t->base;
+}
+
+void r300InitTextureFuncs(struct dd_function_table *functions)
+{
+       /* Note: we only plug in the functions we implement in the driver
+        * since _mesa_init_driver_functions() was already called.
+        */
+       functions->NewTextureImage = radeonNewTextureImage;
+       functions->FreeTexImageData = radeonFreeTexImageData;
+       functions->MapTexture = radeonMapTexture;
+       functions->UnmapTexture = radeonUnmapTexture;
+
+       functions->ChooseTextureFormat = radeonChooseTextureFormat_mesa;
+       functions->TexImage1D = radeonTexImage1D;
+       functions->TexImage2D = radeonTexImage2D;
+       functions->TexImage3D = radeonTexImage3D;
+       functions->TexSubImage1D = radeonTexSubImage1D;
+       functions->TexSubImage2D = radeonTexSubImage2D;
+       functions->TexSubImage3D = radeonTexSubImage3D;
+       functions->GetTexImage = radeonGetTexImage;
+       functions->GetCompressedTexImage = radeonGetCompressedTexImage;
+       functions->NewTextureObject = r300NewTextureObject;
+       functions->DeleteTexture = r300DeleteTexture;
+       functions->IsTextureResident = driIsTextureResident;
+
+       functions->TexParameter = r300TexParameter;
+
+       functions->CompressedTexImage2D = radeonCompressedTexImage2D;
+       functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
+
+       functions->GenerateMipmap = radeonGenerateMipmap;
+
+       driInitTextureFormats();
+}
diff --git a/src/mesa/drivers/dri/r600/r600_tex.h b/src/mesa/drivers/dri/r600/r600_tex.h
new file mode 100644 (file)
index 0000000..1d7d396
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef __r600_TEX_H__
+#define __r600_TEX_H__
+
+extern void r300SetDepthTexMode(struct gl_texture_object *tObj);
+
+extern void r300SetTexBuffer(__DRIcontext *pDRICtx, GLint target,
+                            __DRIdrawable *dPriv);
+
+extern void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
+                             GLint format, __DRIdrawable *dPriv);
+
+extern void r300SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
+                            unsigned long long offset, GLint depth,
+                            GLuint pitch);
+
+extern GLboolean r300ValidateBuffers(GLcontext * ctx);
+
+extern void r300InitTextureFuncs(struct dd_function_table *functions);
+
+#endif                         /* __r600_TEX_H__ */
diff --git a/src/mesa/drivers/dri/r600/r600_texstate.c b/src/mesa/drivers/dri/r600/r600_texstate.c
new file mode 100644 (file)
index 0000000..3168d5a
--- /dev/null
@@ -0,0 +1,488 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <keith@tungstengraphics.com>
+ *
+ * \todo Enable R300 texture tiling code?
+ */
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/context.h"
+#include "main/macros.h"
+#include "main/texformat.h"
+#include "main/teximage.h"
+#include "main/texobj.h"
+#include "main/enums.h"
+
+#include "r600_context.h"
+#include "r600_state.h"
+#include "r600_ioctl.h"
+#include "radeon_mipmap_tree.h"
+#include "r600_tex.h"
+#include "r600_reg.h"
+
+#define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5                        \
+                          || ((f) >= MESA_FORMAT_RGBA_FLOAT32 &&       \
+                              (f) <= MESA_FORMAT_INTENSITY_FLOAT16))   \
+                         && tx_table[f].flag )
+
+#define _ASSIGN(entry, format)                         \
+       [ MESA_FORMAT_ ## entry ] = { format, 0, 1}
+
+/*
+ * Note that the _REV formats are the same as the non-REV formats.  This is
+ * because the REV and non-REV formats are identical as a byte string, but
+ * differ when accessed as 16-bit or 32-bit words depending on the endianness of
+ * the host.  Since the textures are transferred to the R300 as a byte string
+ * (i.e. without any byte-swapping), the R300 sees the REV and non-REV formats
+ * identically.  -- paulus
+ */
+
+static const struct tx_table {
+       GLuint format, filter, flag;
+} tx_table[] = {
+       /* *INDENT-OFF* */
+#ifdef MESA_LITTLE_ENDIAN
+       _ASSIGN(RGBA8888, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8)),
+       _ASSIGN(RGBA8888_REV, R300_EASY_TX_FORMAT(Z, Y, X, W, W8Z8Y8X8)),
+       _ASSIGN(ARGB8888, R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8)),
+       _ASSIGN(ARGB8888_REV, R300_EASY_TX_FORMAT(W, Z, Y, X, W8Z8Y8X8)),
+#else
+       _ASSIGN(RGBA8888, R300_EASY_TX_FORMAT(Z, Y, X, W, W8Z8Y8X8)),
+       _ASSIGN(RGBA8888_REV, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8)),
+       _ASSIGN(ARGB8888, R300_EASY_TX_FORMAT(W, Z, Y, X, W8Z8Y8X8)),
+       _ASSIGN(ARGB8888_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8)),
+#endif
+       _ASSIGN(RGB888, R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8)),
+       _ASSIGN(RGB565, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
+       _ASSIGN(RGB565_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5)),
+       _ASSIGN(ARGB4444, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
+       _ASSIGN(ARGB4444_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W4Z4Y4X4)),
+       _ASSIGN(ARGB1555, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
+       _ASSIGN(ARGB1555_REV, R300_EASY_TX_FORMAT(X, Y, Z, W, W1Z5Y5X5)),
+       _ASSIGN(AL88, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
+       _ASSIGN(AL88_REV, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8)),
+       _ASSIGN(RGB332, R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z3Y3X2)),
+       _ASSIGN(A8, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X8)),
+       _ASSIGN(L8, R300_EASY_TX_FORMAT(X, X, X, ONE, X8)),
+       _ASSIGN(I8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
+       _ASSIGN(CI8, R300_EASY_TX_FORMAT(X, X, X, X, X8)),
+       _ASSIGN(YCBCR, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8) | R300_TX_FORMAT_YUV_MODE),
+       _ASSIGN(YCBCR_REV, R300_EASY_TX_FORMAT(X, Y, Z, ONE, G8R8_G8B8) | R300_TX_FORMAT_YUV_MODE),
+       _ASSIGN(RGB_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, ONE, DXT1)),
+       _ASSIGN(RGBA_DXT1, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT1)),
+       _ASSIGN(RGBA_DXT3, R300_EASY_TX_FORMAT(X, Y, Z, W, DXT3)),
+       _ASSIGN(RGBA_DXT5, R300_EASY_TX_FORMAT(Y, Z, W, X, DXT5)),
+       _ASSIGN(RGBA_FLOAT32, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R32G32B32A32)),
+       _ASSIGN(RGBA_FLOAT16, R300_EASY_TX_FORMAT(Z, Y, X, W, FL_R16G16B16A16)),
+       _ASSIGN(RGB_FLOAT32, 0xffffffff),
+       _ASSIGN(RGB_FLOAT16, 0xffffffff),
+       _ASSIGN(ALPHA_FLOAT32, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I32)),
+       _ASSIGN(ALPHA_FLOAT16, R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, FL_I16)),
+       _ASSIGN(LUMINANCE_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I32)),
+       _ASSIGN(LUMINANCE_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, ONE, FL_I16)),
+       _ASSIGN(LUMINANCE_ALPHA_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I32A32)),
+       _ASSIGN(LUMINANCE_ALPHA_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, Y, FL_I16A16)),
+       _ASSIGN(INTENSITY_FLOAT32, R300_EASY_TX_FORMAT(X, X, X, X, FL_I32)),
+       _ASSIGN(INTENSITY_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, X, FL_I16)),
+       _ASSIGN(Z16, R300_EASY_TX_FORMAT(X, X, X, X, X16)),
+       _ASSIGN(Z24_S8, R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8)),
+       _ASSIGN(Z32, R300_EASY_TX_FORMAT(X, X, X, X, X32)),
+       /* *INDENT-ON* */
+};
+
+#undef _ASSIGN
+
+void r300SetDepthTexMode(struct gl_texture_object *tObj)
+{
+       static const GLuint formats[3][3] = {
+               {
+                       R300_EASY_TX_FORMAT(X, X, X, ONE, X16),
+                       R300_EASY_TX_FORMAT(X, X, X, X, X16),
+                       R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X16),
+               },
+               {
+                       R300_EASY_TX_FORMAT(X, X, X, ONE, X24_Y8),
+                       R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8),
+                       R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X24_Y8),
+               },
+               {
+                       R300_EASY_TX_FORMAT(X, X, X, ONE, X32),
+                       R300_EASY_TX_FORMAT(X, X, X, X, X32),
+                       R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X32),
+               },
+       };
+       const GLuint *format;
+       radeonTexObjPtr t;
+
+       if (!tObj)
+               return;
+
+       t = radeon_tex_obj(tObj);
+
+       switch (tObj->Image[0][tObj->BaseLevel]->TexFormat->MesaFormat) {
+       case MESA_FORMAT_Z16:
+               format = formats[0];
+               break;
+       case MESA_FORMAT_Z24_S8:
+               format = formats[1];
+               break;
+       case MESA_FORMAT_Z32:
+               format = formats[2];
+               break;
+       default:
+               /* Error...which should have already been caught by higher
+                * levels of Mesa.
+                */
+               ASSERT(0);
+               return;
+       }
+
+       switch (tObj->DepthMode) {
+       case GL_LUMINANCE:
+               t->pp_txformat = format[0];
+               break;
+       case GL_INTENSITY:
+               t->pp_txformat = format[1];
+               break;
+       case GL_ALPHA:
+               t->pp_txformat = format[2];
+               break;
+       default:
+               /* Error...which should have already been caught by higher
+                * levels of Mesa.
+                */
+               ASSERT(0);
+               return;
+       }
+}
+
+
+/**
+ * Compute the cached hardware register values for the given texture object.
+ *
+ * \param rmesa Context pointer
+ * \param t the r300 texture object
+ */
+static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
+{
+       const struct gl_texture_image *firstImage;
+       int firstlevel = t->mt ? t->mt->firstLevel : 0;
+           
+       firstImage = t->base.Image[0][firstlevel];
+
+       if (!t->image_override
+           && VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
+               if (firstImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
+                       r300SetDepthTexMode(&t->base);
+               } else {
+                       t->pp_txformat = tx_table[firstImage->TexFormat->MesaFormat].format;
+               }
+
+               t->pp_txfilter |= tx_table[firstImage->TexFormat->MesaFormat].filter;
+       } else if (!t->image_override) {
+               _mesa_problem(NULL, "unexpected texture format in %s",
+                             __FUNCTION__);
+               return;
+       }
+
+       if (t->image_override && t->bo)
+               return;
+
+       t->pp_txsize = (((firstImage->Width - 1) << R300_TX_WIDTHMASK_SHIFT)
+                       | ((firstImage->Height - 1) << R300_TX_HEIGHTMASK_SHIFT)
+                       | ((firstImage->DepthLog2) << R300_TX_DEPTHMASK_SHIFT)
+                       | ((t->mt->lastLevel - t->mt->firstLevel) << R300_TX_MAX_MIP_LEVEL_SHIFT));
+
+       t->tile_bits = 0;
+
+       if (t->base.Target == GL_TEXTURE_CUBE_MAP)
+               t->pp_txformat |= R300_TX_FORMAT_CUBIC_MAP;
+       if (t->base.Target == GL_TEXTURE_3D)
+               t->pp_txformat |= R300_TX_FORMAT_3D;
+
+
+       if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
+               unsigned int align = (64 / t->mt->bpp) - 1;
+               t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
+               if (!t->image_override)
+                       t->pp_txpitch = ((firstImage->Width + align) & ~align) - 1;
+       }
+
+       if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+           if (firstImage->Width > 2048)
+               t->pp_txpitch |= R500_TXWIDTH_BIT11;
+           if (firstImage->Height > 2048)
+               t->pp_txpitch |= R500_TXHEIGHT_BIT11;
+       }
+}
+
+/**
+ * Ensure the given texture is ready for rendering.
+ *
+ * Mostly this means populating the texture object's mipmap tree.
+ */
+static GLboolean r300_validate_texture(GLcontext * ctx, struct gl_texture_object *texObj)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       radeonTexObj *t = radeon_tex_obj(texObj);
+
+       if (!radeon_validate_texture_miptree(ctx, texObj))
+               return GL_FALSE;
+
+       /* Configure the hardware registers (more precisely, the cached version
+        * of the hardware registers). */
+       setup_hardware_state(rmesa, t);
+
+       t->validated = GL_TRUE;
+       return GL_TRUE;
+}
+
+/**
+ * Ensure all enabled and complete textures are uploaded along with any buffers being used.
+ */
+GLboolean r300ValidateBuffers(GLcontext * ctx)
+{
+       r300ContextPtr rmesa = R300_CONTEXT(ctx);
+       struct radeon_renderbuffer *rrb;
+       int i;
+
+       radeon_validate_reset_bos(&rmesa->radeon);
+
+       rrb = radeon_get_colorbuffer(&rmesa->radeon);
+       /* color buffer */
+       if (rrb && rrb->bo) {
+               radeon_validate_bo(&rmesa->radeon, rrb->bo,
+                                  0, RADEON_GEM_DOMAIN_VRAM);
+       }
+
+       /* depth buffer */
+       rrb = radeon_get_depthbuffer(&rmesa->radeon);
+       if (rrb && rrb->bo) {
+               radeon_validate_bo(&rmesa->radeon, rrb->bo,
+                                  0, RADEON_GEM_DOMAIN_VRAM);
+       }
+       
+       for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
+               radeonTexObj *t;
+
+               if (!ctx->Texture.Unit[i]._ReallyEnabled)
+                       continue;
+
+               if (!r300_validate_texture(ctx, ctx->Texture.Unit[i]._Current)) {
+                       _mesa_warning(ctx,
+                                     "failed to validate texture for unit %d.\n",
+                                     i);
+               }
+               t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
+               if (t->image_override && t->bo)
+                       radeon_validate_bo(&rmesa->radeon, t->bo,
+                                          RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+
+               else if (t->mt->bo)
+                       radeon_validate_bo(&rmesa->radeon, t->mt->bo,
+                                          RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+       }
+       if (rmesa->radeon.dma.current)
+               radeon_validate_bo(&rmesa->radeon, rmesa->radeon.dma.current, RADEON_GEM_DOMAIN_GTT, 0);
+
+       return radeon_revalidate_bos(ctx);
+}
+
+void r300SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
+                     unsigned long long offset, GLint depth, GLuint pitch)
+{
+       r300ContextPtr rmesa = pDRICtx->driverPrivate;
+       struct gl_texture_object *tObj =
+           _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
+       radeonTexObjPtr t = radeon_tex_obj(tObj);
+       uint32_t pitch_val;
+
+       if (!tObj)
+               return;
+
+       t->image_override = GL_TRUE;
+
+       if (!offset)
+               return;
+
+       t->bo = NULL;
+       t->override_offset = offset;
+       t->pp_txpitch &= (1 << 13) -1;
+       pitch_val = pitch;
+
+       switch (depth) {
+       case 32:
+               t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
+               t->pp_txfilter |= tx_table[2].filter;
+               pitch_val /= 4;
+               break;
+       case 24:
+       default:
+               t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+               t->pp_txfilter |= tx_table[4].filter;
+               pitch_val /= 4;
+               break;
+       case 16:
+               t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
+               t->pp_txfilter |= tx_table[5].filter;
+               pitch_val /= 2;
+               break;
+       }
+       pitch_val--;
+
+       t->pp_txpitch |= pitch_val;
+}
+
+void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format, __DRIdrawable *dPriv)
+{
+       struct gl_texture_unit *texUnit;
+       struct gl_texture_object *texObj;
+       struct gl_texture_image *texImage;
+       struct radeon_renderbuffer *rb;
+       radeon_texture_image *rImage;
+       radeonContextPtr radeon;
+       r300ContextPtr rmesa;
+       struct radeon_framebuffer *rfb;
+       radeonTexObjPtr t;
+       uint32_t pitch_val;
+       uint32_t internalFormat, type, format;
+
+       type = GL_BGRA;
+       format = GL_UNSIGNED_BYTE;
+       internalFormat = (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT ? 3 : 4);
+
+       radeon = pDRICtx->driverPrivate;
+       rmesa = pDRICtx->driverPrivate;
+
+       rfb = dPriv->driverPrivate;
+        texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
+       texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
+        texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
+
+       rImage = get_radeon_texture_image(texImage);
+       t = radeon_tex_obj(texObj);
+        if (t == NULL) {
+           return;
+       }
+
+       radeon_update_renderbuffers(pDRICtx, dPriv);
+       /* back & depth buffer are useless free them right away */
+       rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
+       if (rb && rb->bo) {
+               radeon_bo_unref(rb->bo);
+        rb->bo = NULL;
+       }
+       rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+       if (rb && rb->bo) {
+               radeon_bo_unref(rb->bo);
+               rb->bo = NULL;
+       }
+       rb = rfb->color_rb[0];
+       if (rb->bo == NULL) {
+               /* Failed to BO for the buffer */
+               return;
+       }
+       
+       _mesa_lock_texture(radeon->glCtx, texObj);
+       if (t->bo) {
+               radeon_bo_unref(t->bo);
+               t->bo = NULL;
+       }
+       if (rImage->bo) {
+               radeon_bo_unref(rImage->bo);
+               rImage->bo = NULL;
+       }
+       if (t->mt) {
+               radeon_miptree_unreference(t->mt);
+               t->mt = NULL;
+       }
+       if (rImage->mt) {
+               radeon_miptree_unreference(rImage->mt);
+               rImage->mt = NULL;
+       }
+       fprintf(stderr,"settexbuf %dx%d@%d %d targ %x format %x\n", rb->width, rb->height, rb->cpp, rb->pitch, target, format);
+       _mesa_init_teximage_fields(radeon->glCtx, target, texImage,
+                                  rb->width, rb->height, 1, 0, rb->cpp);
+       texImage->RowStride = rb->pitch / rb->cpp;
+       texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
+                                                       internalFormat,
+                                                       type, format, 0);
+       rImage->bo = rb->bo;
+       radeon_bo_ref(rImage->bo);
+       t->bo = rb->bo;
+       radeon_bo_ref(t->bo);
+       t->tile_bits = 0;
+       t->image_override = GL_TRUE;
+       t->override_offset = 0;
+       t->pp_txpitch &= (1 << 13) -1;
+       pitch_val = rb->pitch;
+       switch (rb->cpp) {
+       case 4:
+               t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
+               t->pp_txfilter |= tx_table[2].filter;
+               pitch_val /= 4;
+               break;
+       case 3:
+       default:
+               t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+               t->pp_txfilter |= tx_table[4].filter;
+               pitch_val /= 4;
+               break;
+       case 2:
+               t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
+               t->pp_txfilter |= tx_table[5].filter;
+               pitch_val /= 2;
+               break;
+       }
+       pitch_val--;
+       t->pp_txsize = ((rb->width - 1) << R300_TX_WIDTHMASK_SHIFT) |
+              ((rb->height - 1) << R300_TX_HEIGHTMASK_SHIFT);
+       t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
+       t->pp_txpitch |= pitch_val;
+
+       if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+           if (rb->width > 2048)
+               t->pp_txpitch |= R500_TXWIDTH_BIT11;
+           if (rb->height > 2048)
+               t->pp_txpitch |= R500_TXHEIGHT_BIT11;
+       }
+       t->validated = GL_TRUE;
+       _mesa_unlock_texture(radeon->glCtx, texObj);
+       return;
+}
+
+void r300SetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+{
+        r300SetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
+}
diff --git a/src/mesa/drivers/dri/r600/r600_vertprog.c b/src/mesa/drivers/dri/r600/r600_vertprog.c
new file mode 100644 (file)
index 0000000..946d8be
--- /dev/null
@@ -0,0 +1,1479 @@
+/**************************************************************************
+
+Copyright (C) 2005  Aapo Tahkola <aet@rasterburn.org>
+Copyright (C) 2008  Oliver McFadden <z3ro.geek@gmail.com>
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/* Radeon R5xx Acceleration, Revision 1.2 */
+
+#include "main/glheader.h"
+#include "main/macros.h"
+#include "main/enums.h"
+#include "shader/program.h"
+#include "shader/prog_instruction.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+#include "tnl/tnl.h"
+
+#include "r600_context.h"
+
+/* TODO: Get rid of t_src_class call */
+#define CMP_SRCS(a, b) ((a.RelAddr != b.RelAddr) || (a.Index != b.Index && \
+                      ((t_src_class(a.File) == PVS_SRC_REG_CONSTANT && \
+                        t_src_class(b.File) == PVS_SRC_REG_CONSTANT) || \
+                       (t_src_class(a.File) == PVS_SRC_REG_INPUT && \
+                        t_src_class(b.File) == PVS_SRC_REG_INPUT)))) \
+
+/*
+ * Take an already-setup and valid source then swizzle it appropriately to
+ * obtain a constant ZERO or ONE source.
+ */
+#define __CONST(x, y)  \
+       (PVS_SRC_OPERAND(t_src_index(vp, &src[x]),      \
+                          t_swizzle(y),        \
+                          t_swizzle(y),        \
+                          t_swizzle(y),        \
+                          t_swizzle(y),        \
+                          t_src_class(src[x].File), \
+                          VSF_FLAG_NONE) | (src[x].RelAddr << 4))
+
+#define FREE_TEMPS() \
+       do { \
+               int u_temp_used = (VSF_MAX_FRAGMENT_TEMPS - 1) - u_temp_i; \
+               if((vp->num_temporaries + u_temp_used) > VSF_MAX_FRAGMENT_TEMPS) { \
+                       WARN_ONCE("Ran out of temps, num temps %d, us %d\n", vp->num_temporaries, u_temp_used); \
+                       vp->native = GL_FALSE; \
+               } \
+               u_temp_i=VSF_MAX_FRAGMENT_TEMPS-1; \
+       } while (0)
+
+int r300VertexProgUpdateParams(GLcontext * ctx,
+                              struct r300_vertex_program_cont *vp, float *dst)
+{
+       int pi;
+       struct gl_vertex_program *mesa_vp = &vp->mesa_program;
+       float *dst_o = dst;
+       struct gl_program_parameter_list *paramList;
+
+       if (mesa_vp->IsNVProgram) {
+               _mesa_load_tracked_matrices(ctx);
+
+               for (pi = 0; pi < MAX_NV_VERTEX_PROGRAM_PARAMS; pi++) {
+                       *dst++ = ctx->VertexProgram.Parameters[pi][0];
+                       *dst++ = ctx->VertexProgram.Parameters[pi][1];
+                       *dst++ = ctx->VertexProgram.Parameters[pi][2];
+                       *dst++ = ctx->VertexProgram.Parameters[pi][3];
+               }
+               return dst - dst_o;
+       }
+
+       assert(mesa_vp->Base.Parameters);
+       _mesa_load_state_parameters(ctx, mesa_vp->Base.Parameters);
+
+       if (mesa_vp->Base.Parameters->NumParameters * 4 >
+           VSF_MAX_FRAGMENT_LENGTH) {
+               fprintf(stderr, "%s:Params exhausted\n", __FUNCTION__);
+               _mesa_exit(-1);
+       }
+
+       paramList = mesa_vp->Base.Parameters;
+       for (pi = 0; pi < paramList->NumParameters; pi++) {
+               switch (paramList->Parameters[pi].Type) {
+               case PROGRAM_STATE_VAR:
+               case PROGRAM_NAMED_PARAM:
+                       //fprintf(stderr, "%s", vp->Parameters->Parameters[pi].Name);
+               case PROGRAM_CONSTANT:
+                       *dst++ = paramList->ParameterValues[pi][0];
+                       *dst++ = paramList->ParameterValues[pi][1];
+                       *dst++ = paramList->ParameterValues[pi][2];
+                       *dst++ = paramList->ParameterValues[pi][3];
+                       break;
+               default:
+                       _mesa_problem(NULL, "Bad param type in %s",
+                                     __FUNCTION__);
+               }
+
+       }
+
+       return dst - dst_o;
+}
+
+static unsigned long t_dst_mask(GLuint mask)
+{
+       /* WRITEMASK_* is equivalent to VSF_FLAG_* */
+       return mask & VSF_FLAG_ALL;
+}
+
+static unsigned long t_dst_class(gl_register_file file)
+{
+
+       switch (file) {
+       case PROGRAM_TEMPORARY:
+               return PVS_DST_REG_TEMPORARY;
+       case PROGRAM_OUTPUT:
+               return PVS_DST_REG_OUT;
+       case PROGRAM_ADDRESS:
+               return PVS_DST_REG_A0;
+               /*
+                  case PROGRAM_INPUT:
+                  case PROGRAM_LOCAL_PARAM:
+                  case PROGRAM_ENV_PARAM:
+                  case PROGRAM_NAMED_PARAM:
+                  case PROGRAM_STATE_VAR:
+                  case PROGRAM_WRITE_ONLY:
+                  case PROGRAM_ADDRESS:
+                */
+       default:
+               fprintf(stderr, "problem in %s", __FUNCTION__);
+               _mesa_exit(-1);
+               return -1;
+       }
+}
+
+static unsigned long t_dst_index(struct r300_vertex_program *vp,
+                                struct prog_dst_register *dst)
+{
+       if (dst->File == PROGRAM_OUTPUT)
+               return vp->outputs[dst->Index];
+
+       return dst->Index;
+}
+
+static unsigned long t_src_class(gl_register_file file)
+{
+       switch (file) {
+       case PROGRAM_TEMPORARY:
+               return PVS_SRC_REG_TEMPORARY;
+       case PROGRAM_INPUT:
+               return PVS_SRC_REG_INPUT;
+       case PROGRAM_LOCAL_PARAM:
+       case PROGRAM_ENV_PARAM:
+       case PROGRAM_NAMED_PARAM:
+       case PROGRAM_CONSTANT:
+       case PROGRAM_STATE_VAR:
+               return PVS_SRC_REG_CONSTANT;
+               /*
+                  case PROGRAM_OUTPUT:
+                  case PROGRAM_WRITE_ONLY:
+                  case PROGRAM_ADDRESS:
+                */
+       default:
+               fprintf(stderr, "problem in %s", __FUNCTION__);
+               _mesa_exit(-1);
+               return -1;
+       }
+}
+
+static INLINE unsigned long t_swizzle(GLubyte swizzle)
+{
+/* this is in fact a NOP as the Mesa SWIZZLE_* are all identical to VSF_IN_COMPONENT_* */
+       return swizzle;
+}
+
+#if 0
+static void vp_dump_inputs(struct r300_vertex_program *vp, char *caller)
+{
+       int i;
+
+       if (vp == NULL) {
+               fprintf(stderr, "vp null in call to %s from %s\n", __FUNCTION__,
+                       caller);
+               return;
+       }
+
+       fprintf(stderr, "%s:<", caller);
+       for (i = 0; i < VERT_ATTRIB_MAX; i++)
+               fprintf(stderr, "%d ", vp->inputs[i]);
+       fprintf(stderr, ">\n");
+
+}
+#endif
+
+static unsigned long t_src_index(struct r300_vertex_program *vp,
+                                struct prog_src_register *src)
+{
+       int i;
+       int max_reg = -1;
+
+       if (src->File == PROGRAM_INPUT) {
+               if (vp->inputs[src->Index] != -1)
+                       return vp->inputs[src->Index];
+
+               for (i = 0; i < VERT_ATTRIB_MAX; i++)
+                       if (vp->inputs[i] > max_reg)
+                               max_reg = vp->inputs[i];
+
+               vp->inputs[src->Index] = max_reg + 1;
+
+               //vp_dump_inputs(vp, __FUNCTION__);
+
+               return vp->inputs[src->Index];
+       } else {
+               if (src->Index < 0) {
+                       fprintf(stderr,
+                               "negative offsets for indirect addressing do not work.\n");
+                       return 0;
+               }
+               return src->Index;
+       }
+}
+
+/* these two functions should probably be merged... */
+
+static unsigned long t_src(struct r300_vertex_program *vp,
+                          struct prog_src_register *src)
+{
+       /* src->NegateBase uses the NEGATE_ flags from program_instruction.h,
+        * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
+        */
+       return PVS_SRC_OPERAND(t_src_index(vp, src),
+                              t_swizzle(GET_SWZ(src->Swizzle, 0)),
+                              t_swizzle(GET_SWZ(src->Swizzle, 1)),
+                              t_swizzle(GET_SWZ(src->Swizzle, 2)),
+                              t_swizzle(GET_SWZ(src->Swizzle, 3)),
+                              t_src_class(src->File),
+                              src->NegateBase) | (src->RelAddr << 4);
+}
+
+static unsigned long t_src_scalar(struct r300_vertex_program *vp,
+                                 struct prog_src_register *src)
+{
+       /* src->NegateBase uses the NEGATE_ flags from program_instruction.h,
+        * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
+        */
+       return PVS_SRC_OPERAND(t_src_index(vp, src),
+                              t_swizzle(GET_SWZ(src->Swizzle, 0)),
+                              t_swizzle(GET_SWZ(src->Swizzle, 0)),
+                              t_swizzle(GET_SWZ(src->Swizzle, 0)),
+                              t_swizzle(GET_SWZ(src->Swizzle, 0)),
+                              t_src_class(src->File),
+                              src->
+                              NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src->RelAddr << 4);
+}
+
+static GLboolean valid_dst(struct r300_vertex_program *vp,
+                          struct prog_dst_register *dst)
+{
+       if (dst->File == PROGRAM_OUTPUT && vp->outputs[dst->Index] == -1) {
+               return GL_FALSE;
+       } else if (dst->File == PROGRAM_ADDRESS) {
+               assert(dst->Index == 0);
+       }
+
+       return GL_TRUE;
+}
+
+static GLuint *r300TranslateOpcodeABS(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       //MAX RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
+
+       inst[0] = PVS_OP_DST_OPERAND(VE_MAXIMUM,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),
+                                 t_src_class(src[0].File),
+                                 (!src[0].
+                                  NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src[0].RelAddr << 4);
+       inst[3] = 0;
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeADD(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = t_src(vp, &src[1]);
+       inst[3] = __CONST(1, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeARL(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(VE_FLT2FIX_DX,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeDP3(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       //DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ZERO} PARAM 0{} {X Y Z ZERO}
+
+       inst[0] = PVS_OP_DST_OPERAND(VE_DOT_PRODUCT,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
+                                 SWIZZLE_ZERO,
+                                 t_src_class(src[0].File),
+                                 src[0].
+                                 NegateBase ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
+           (src[0].RelAddr << 4);
+       inst[2] =
+           PVS_SRC_OPERAND(t_src_index(vp, &src[1]),
+                           t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
+                           t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
+                           t_swizzle(GET_SWZ(src[1].Swizzle, 2)), SWIZZLE_ZERO,
+                           t_src_class(src[1].File),
+                           src[1].
+                           NegateBase ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
+           (src[1].RelAddr << 4);
+       inst[3] = __CONST(1, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeDP4(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(VE_DOT_PRODUCT,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = t_src(vp, &src[1]);
+       inst[3] = __CONST(1, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeDPH(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       //DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W}
+       inst[0] = PVS_OP_DST_OPERAND(VE_DOT_PRODUCT,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
+                                 PVS_SRC_SELECT_FORCE_1,
+                                 t_src_class(src[0].File),
+                                 src[0].
+                                 NegateBase ? VSF_FLAG_XYZ : VSF_FLAG_NONE) |
+           (src[0].RelAddr << 4);
+       inst[2] = t_src(vp, &src[1]);
+       inst[3] = __CONST(1, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeDST(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(VE_DISTANCE_VECTOR,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = t_src(vp, &src[1]);
+       inst[3] = __CONST(1, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeEX2(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(ME_EXP_BASE2_FULL_DX,
+                                    GL_TRUE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src_scalar(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeEXP(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(ME_EXP_BASE2_DX,
+                                    GL_TRUE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src_scalar(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeFLR(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3],
+                                     int *u_temp_i)
+{
+       /* FRC TMP 0.X Y Z W PARAM 0{} {X Y Z W}
+          ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} TMP 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W */
+
+       inst[0] = PVS_OP_DST_OPERAND(VE_FRACTION,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    *u_temp_i,
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    PVS_DST_REG_TEMPORARY);
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+       inst += 4;
+
+       inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = PVS_SRC_OPERAND(*u_temp_i,
+                                 PVS_SRC_SELECT_X,
+                                 PVS_SRC_SELECT_Y,
+                                 PVS_SRC_SELECT_Z,
+                                 PVS_SRC_SELECT_W, PVS_SRC_REG_TEMPORARY,
+                                 /* Not 100% sure about this */
+                                 (!src[0].
+                                  NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE
+                                 /*VSF_FLAG_ALL */ );
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+       (*u_temp_i)--;
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeFRC(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(VE_FRACTION,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeLG2(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       // LG2 RESULT 1.X Y Z W PARAM 0{} {X X X X}
+
+       inst[0] = PVS_OP_DST_OPERAND(ME_LOG_BASE2_FULL_DX,
+                                    GL_TRUE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
+                                 t_src_class(src[0].File),
+                                 src[0].
+                                 NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src[0].RelAddr << 4);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeLIT(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
+
+       inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
+                                    GL_TRUE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       /* NOTE: Users swizzling might not work. */
+       inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 0)),      // X
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),        // W
+                                 PVS_SRC_SELECT_FORCE_0,       // Z
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 1)),        // Y
+                                 t_src_class(src[0].File),
+                                 src[0].
+                                 NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src[0].RelAddr << 4);
+       inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),      // Y
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),        // W
+                                 PVS_SRC_SELECT_FORCE_0,       // Z
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),        // X
+                                 t_src_class(src[0].File),
+                                 src[0].
+                                 NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src[0].RelAddr << 4);
+       inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),      // Y
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),        // X
+                                 PVS_SRC_SELECT_FORCE_0,       // Z
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),        // W
+                                 t_src_class(src[0].File),
+                                 src[0].
+                                 NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src[0].RelAddr << 4);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeLOG(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(ME_LOG_BASE2_DX,
+                                    GL_TRUE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src_scalar(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeMAD(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
+                                    GL_FALSE,
+                                    GL_TRUE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = t_src(vp, &src[1]);
+       inst[3] = t_src(vp, &src[2]);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeMAX(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(VE_MAXIMUM,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = t_src(vp, &src[1]);
+       inst[3] = __CONST(1, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeMIN(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(VE_MINIMUM,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = t_src(vp, &src[1]);
+       inst[3] = __CONST(1, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeMOV(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       //ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO}
+
+       inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeMUL(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = t_src(vp, &src[1]);
+       inst[3] = __CONST(1, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodePOW(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
+                                    GL_TRUE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src_scalar(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = t_src_scalar(vp, &src[1]);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeRCP(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(ME_RECIP_DX,
+                                    GL_TRUE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src_scalar(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeRSQ(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(ME_RECIP_SQRT_DX,
+                                    GL_TRUE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src_scalar(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeSGE(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(VE_SET_GREATER_THAN_EQUAL,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = t_src(vp, &src[1]);
+       inst[3] = __CONST(1, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeSLT(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       inst[0] = PVS_OP_DST_OPERAND(VE_SET_LESS_THAN,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = t_src(vp, &src[1]);
+       inst[3] = __CONST(1, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeSUB(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       //ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
+
+#if 0
+       inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[1]),
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
+                                 t_src_class(src[1].File),
+                                 (!src[1].
+                                  NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src[1].RelAddr << 4);
+       inst[3] = 0;
+#else
+       inst[0] =
+           PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
+                              GL_FALSE,
+                              GL_FALSE,
+                              t_dst_index(vp, &vpi->DstReg),
+                              t_dst_mask(vpi->DstReg.WriteMask),
+                              t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ONE);
+       inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &src[1]),
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
+                                 t_src_class(src[1].File),
+                                 (!src[1].
+                                  NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src[1].RelAddr << 4);
+#endif
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeSWZ(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3])
+{
+       //ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO}
+
+       inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = t_src(vp, &src[0]);
+       inst[2] = __CONST(0, SWIZZLE_ZERO);
+       inst[3] = __CONST(0, SWIZZLE_ZERO);
+
+       return inst;
+}
+
+static GLuint *r300TranslateOpcodeXPD(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi,
+                                     GLuint * inst,
+                                     struct prog_src_register src[3],
+                                     int *u_temp_i)
+{
+       /* mul r0, r1.yzxw, r2.zxyw
+          mad r0, -r2.yzxw, r1.zxyw, r0
+        */
+
+       inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    *u_temp_i,
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    PVS_DST_REG_TEMPORARY);
+       inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 1)),      // Y
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 2)),        // Z
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),        // X
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),        // W
+                                 t_src_class(src[0].File),
+                                 src[0].
+                                 NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src[0].RelAddr << 4);
+       inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[1]), t_swizzle(GET_SWZ(src[1].Swizzle, 2)),      // Z
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 0)),        // X
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 1)),        // Y
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 3)),        // W
+                                 t_src_class(src[1].File),
+                                 src[1].
+                                 NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src[1].RelAddr << 4);
+       inst[3] = __CONST(1, SWIZZLE_ZERO);
+       inst += 4;
+
+       inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
+                                    GL_FALSE,
+                                    GL_FALSE,
+                                    t_dst_index(vp, &vpi->DstReg),
+                                    t_dst_mask(vpi->DstReg.WriteMask),
+                                    t_dst_class(vpi->DstReg.File));
+       inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &src[1]), t_swizzle(GET_SWZ(src[1].Swizzle, 1)),      // Y
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 2)),        // Z
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 0)),        // X
+                                 t_swizzle(GET_SWZ(src[1].Swizzle, 3)),        // W
+                                 t_src_class(src[1].File),
+                                 (!src[1].
+                                  NegateBase) ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src[1].RelAddr << 4);
+       inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &src[0]), t_swizzle(GET_SWZ(src[0].Swizzle, 2)),      // Z
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 0)),        // X
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 1)),        // Y
+                                 t_swizzle(GET_SWZ(src[0].Swizzle, 3)),        // W
+                                 t_src_class(src[0].File),
+                                 src[0].
+                                 NegateBase ? VSF_FLAG_ALL : VSF_FLAG_NONE) |
+           (src[0].RelAddr << 4);
+       inst[3] =
+           PVS_SRC_OPERAND(*u_temp_i, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y,
+                           PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W,
+                           PVS_SRC_REG_TEMPORARY, VSF_FLAG_NONE);
+
+       (*u_temp_i)--;
+
+       return inst;
+}
+
+static void t_inputs_outputs(struct r300_vertex_program *vp)
+{
+       int i;
+       int cur_reg = 0;
+
+       for (i = 0; i < VERT_ATTRIB_MAX; i++)
+               vp->inputs[i] = -1;
+
+       for (i = 0; i < VERT_RESULT_MAX; i++)
+               vp->outputs[i] = -1;
+
+       assert(vp->key.OutputsWritten & (1 << VERT_RESULT_HPOS));
+
+       if (vp->key.OutputsWritten & (1 << VERT_RESULT_HPOS)) {
+               vp->outputs[VERT_RESULT_HPOS] = cur_reg++;
+       }
+
+       if (vp->key.OutputsWritten & (1 << VERT_RESULT_PSIZ)) {
+               vp->outputs[VERT_RESULT_PSIZ] = cur_reg++;
+       }
+
+       if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL0)) {
+               vp->outputs[VERT_RESULT_COL0] = cur_reg++;
+       }
+
+       if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL1)) {
+               vp->outputs[VERT_RESULT_COL1] =
+                   vp->outputs[VERT_RESULT_COL0] + 1;
+               cur_reg = vp->outputs[VERT_RESULT_COL1] + 1;
+       }
+
+       if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0)) {
+               vp->outputs[VERT_RESULT_BFC0] =
+                   vp->outputs[VERT_RESULT_COL0] + 2;
+               cur_reg = vp->outputs[VERT_RESULT_BFC0] + 2;
+       }
+
+       if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+               vp->outputs[VERT_RESULT_BFC1] =
+                   vp->outputs[VERT_RESULT_COL0] + 3;
+               cur_reg = vp->outputs[VERT_RESULT_BFC1] + 1;
+       }
+
+       for (i = VERT_RESULT_TEX0; i <= VERT_RESULT_TEX7; i++) {
+               if (vp->key.OutputsWritten & (1 << i)) {
+                       vp->outputs[i] = cur_reg++;
+               }
+       }
+
+       if (vp->key.OutputsWritten & (1 << VERT_RESULT_FOGC)) {
+               vp->outputs[VERT_RESULT_FOGC] = cur_reg++;
+       }
+}
+
+static void r300TranslateVertexShader(struct r300_vertex_program *vp,
+                                     struct prog_instruction *vpi)
+{
+       int i;
+       GLuint *inst;
+       unsigned long num_operands;
+       /* Initial value should be last tmp reg that hw supports.
+          Strangely enough r300 doesnt mind even though these would be out of range.
+          Smart enough to realize that it doesnt need it? */
+       int u_temp_i = VSF_MAX_FRAGMENT_TEMPS - 1;
+       struct prog_src_register src[3];
+
+       vp->pos_end = 0;        /* Not supported yet */
+       vp->program.length = 0;
+       /*vp->num_temporaries=mesa_vp->Base.NumTemporaries; */
+       vp->translated = GL_TRUE;
+       vp->native = GL_TRUE;
+
+       t_inputs_outputs(vp);
+
+       for (inst = vp->program.body.i; vpi->Opcode != OPCODE_END;
+            vpi++, inst += 4) {
+
+               FREE_TEMPS();
+
+               if (!valid_dst(vp, &vpi->DstReg)) {
+                       /* redirect result to unused temp */
+                       vpi->DstReg.File = PROGRAM_TEMPORARY;
+                       vpi->DstReg.Index = u_temp_i;
+               }
+
+               num_operands = _mesa_num_inst_src_regs(vpi->Opcode);
+
+               /* copy the sources (src) from mesa into a local variable... is this needed? */
+               for (i = 0; i < num_operands; i++) {
+                       src[i] = vpi->SrcReg[i];
+               }
+
+               if (num_operands == 3) {        /* TODO: scalars */
+                       if (CMP_SRCS(src[1], src[2])
+                           || CMP_SRCS(src[0], src[2])) {
+                               inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
+                                                            GL_FALSE,
+                                                            GL_FALSE,
+                                                            u_temp_i,
+                                                            VSF_FLAG_ALL,
+                                                            PVS_DST_REG_TEMPORARY);
+                               inst[1] =
+                                   PVS_SRC_OPERAND(t_src_index(vp, &src[2]),
+                                                   SWIZZLE_X,
+                                                   SWIZZLE_Y,
+                                                   SWIZZLE_Z,
+                                                   SWIZZLE_W,
+                                                   t_src_class(src[2].File),
+                                                   VSF_FLAG_NONE) | (src[2].
+                                                                     RelAddr <<
+                                                                     4);
+                               inst[2] = __CONST(2, SWIZZLE_ZERO);
+                               inst[3] = __CONST(2, SWIZZLE_ZERO);
+                               inst += 4;
+
+                               src[2].File = PROGRAM_TEMPORARY;
+                               src[2].Index = u_temp_i;
+                               src[2].RelAddr = 0;
+                               u_temp_i--;
+                       }
+               }
+
+               if (num_operands >= 2) {
+                       if (CMP_SRCS(src[1], src[0])) {
+                               inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
+                                                            GL_FALSE,
+                                                            GL_FALSE,
+                                                            u_temp_i,
+                                                            VSF_FLAG_ALL,
+                                                            PVS_DST_REG_TEMPORARY);
+                               inst[1] =
+                                   PVS_SRC_OPERAND(t_src_index(vp, &src[0]),
+                                                   SWIZZLE_X,
+                                                   SWIZZLE_Y,
+                                                   SWIZZLE_Z,
+                                                   SWIZZLE_W,
+                                                   t_src_class(src[0].File),
+                                                   VSF_FLAG_NONE) | (src[0].
+                                                                     RelAddr <<
+                                                                     4);
+                               inst[2] = __CONST(0, SWIZZLE_ZERO);
+                               inst[3] = __CONST(0, SWIZZLE_ZERO);
+                               inst += 4;
+
+                               src[0].File = PROGRAM_TEMPORARY;
+                               src[0].Index = u_temp_i;
+                               src[0].RelAddr = 0;
+                               u_temp_i--;
+                       }
+               }
+
+               switch (vpi->Opcode) {
+               case OPCODE_ABS:
+                       inst = r300TranslateOpcodeABS(vp, vpi, inst, src);
+                       break;
+               case OPCODE_ADD:
+                       inst = r300TranslateOpcodeADD(vp, vpi, inst, src);
+                       break;
+               case OPCODE_ARL:
+                       inst = r300TranslateOpcodeARL(vp, vpi, inst, src);
+                       break;
+               case OPCODE_DP3:
+                       inst = r300TranslateOpcodeDP3(vp, vpi, inst, src);
+                       break;
+               case OPCODE_DP4:
+                       inst = r300TranslateOpcodeDP4(vp, vpi, inst, src);
+                       break;
+               case OPCODE_DPH:
+                       inst = r300TranslateOpcodeDPH(vp, vpi, inst, src);
+                       break;
+               case OPCODE_DST:
+                       inst = r300TranslateOpcodeDST(vp, vpi, inst, src);
+                       break;
+               case OPCODE_EX2:
+                       inst = r300TranslateOpcodeEX2(vp, vpi, inst, src);
+                       break;
+               case OPCODE_EXP:
+                       inst = r300TranslateOpcodeEXP(vp, vpi, inst, src);
+                       break;
+               case OPCODE_FLR:
+                       inst = r300TranslateOpcodeFLR(vp, vpi, inst, src,       /* FIXME */
+                                                     &u_temp_i);
+                       break;
+               case OPCODE_FRC:
+                       inst = r300TranslateOpcodeFRC(vp, vpi, inst, src);
+                       break;
+               case OPCODE_LG2:
+                       inst = r300TranslateOpcodeLG2(vp, vpi, inst, src);
+                       break;
+               case OPCODE_LIT:
+                       inst = r300TranslateOpcodeLIT(vp, vpi, inst, src);
+                       break;
+               case OPCODE_LOG:
+                       inst = r300TranslateOpcodeLOG(vp, vpi, inst, src);
+                       break;
+               case OPCODE_MAD:
+                       inst = r300TranslateOpcodeMAD(vp, vpi, inst, src);
+                       break;
+               case OPCODE_MAX:
+                       inst = r300TranslateOpcodeMAX(vp, vpi, inst, src);
+                       break;
+               case OPCODE_MIN:
+                       inst = r300TranslateOpcodeMIN(vp, vpi, inst, src);
+                       break;
+               case OPCODE_MOV:
+                       inst = r300TranslateOpcodeMOV(vp, vpi, inst, src);
+                       break;
+               case OPCODE_MUL:
+                       inst = r300TranslateOpcodeMUL(vp, vpi, inst, src);
+                       break;
+               case OPCODE_POW:
+                       inst = r300TranslateOpcodePOW(vp, vpi, inst, src);
+                       break;
+               case OPCODE_RCP:
+                       inst = r300TranslateOpcodeRCP(vp, vpi, inst, src);
+                       break;
+               case OPCODE_RSQ:
+                       inst = r300TranslateOpcodeRSQ(vp, vpi, inst, src);
+                       break;
+               case OPCODE_SGE:
+                       inst = r300TranslateOpcodeSGE(vp, vpi, inst, src);
+                       break;
+               case OPCODE_SLT:
+                       inst = r300TranslateOpcodeSLT(vp, vpi, inst, src);
+                       break;
+               case OPCODE_SUB:
+                       inst = r300TranslateOpcodeSUB(vp, vpi, inst, src);
+                       break;
+               case OPCODE_SWZ:
+                       inst = r300TranslateOpcodeSWZ(vp, vpi, inst, src);
+                       break;
+               case OPCODE_XPD:
+                       inst = r300TranslateOpcodeXPD(vp, vpi, inst, src,       /* FIXME */
+                                                     &u_temp_i);
+                       break;
+               default:
+                       assert(0);
+                       break;
+               }
+       }
+
+       /* Some outputs may be artificially added, to match the inputs
+          of the fragment program. Blank the outputs here. */
+       for (i = 0; i < VERT_RESULT_MAX; i++) {
+               if (vp->key.OutputsAdded & (1 << i)) {
+                       inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
+                                                    GL_FALSE,
+                                                    GL_FALSE,
+                                                    vp->outputs[i],
+                                                    VSF_FLAG_ALL,
+                                                    PVS_DST_REG_OUT);
+                       inst[1] = __CONST(0, SWIZZLE_ZERO);
+                       inst[2] = __CONST(0, SWIZZLE_ZERO);
+                       inst[3] = __CONST(0, SWIZZLE_ZERO);
+                       inst += 4;
+               }
+       }
+
+       vp->program.length = (inst - vp->program.body.i);
+       if (vp->program.length >= VSF_MAX_FRAGMENT_LENGTH) {
+               vp->program.length = 0;
+               vp->native = GL_FALSE;
+       }
+#if 0
+       fprintf(stderr, "hw program:\n");
+       for (i = 0; i < vp->program.length; i++)
+               fprintf(stderr, "%08x\n", vp->program.body.d[i]);
+#endif
+}
+
+/* DP4 version seems to trigger some hw peculiarity */
+//#define PREFER_DP4
+
+static void position_invariant(struct gl_program *prog)
+{
+       struct prog_instruction *vpi;
+       struct gl_program_parameter_list *paramList;
+       int i;
+
+       gl_state_index tokens[STATE_LENGTH] = { STATE_MVP_MATRIX, 0, 0, 0, 0 };
+
+       /* tokens[4] = matrix modifier */
+#ifdef PREFER_DP4
+       tokens[4] = 0;          /* not transposed or inverted */
+#else
+       tokens[4] = STATE_MATRIX_TRANSPOSE;
+#endif
+       paramList = prog->Parameters;
+
+       vpi = _mesa_alloc_instructions(prog->NumInstructions + 4);
+       _mesa_init_instructions(vpi, prog->NumInstructions + 4);
+
+       for (i = 0; i < 4; i++) {
+               GLint idx;
+               tokens[2] = tokens[3] = i;      /* matrix row[i]..row[i] */
+               idx = _mesa_add_state_reference(paramList, tokens);
+#ifdef PREFER_DP4
+               vpi[i].Opcode = OPCODE_DP4;
+               vpi[i].StringPos = 0;
+               vpi[i].Data = 0;
+
+               vpi[i].DstReg.File = PROGRAM_OUTPUT;
+               vpi[i].DstReg.Index = VERT_RESULT_HPOS;
+               vpi[i].DstReg.WriteMask = 1 << i;
+               vpi[i].DstReg.CondMask = COND_TR;
+
+               vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
+               vpi[i].SrcReg[0].Index = idx;
+               vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+               vpi[i].SrcReg[1].File = PROGRAM_INPUT;
+               vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
+               vpi[i].SrcReg[1].Swizzle = SWIZZLE_XYZW;
+#else
+               if (i == 0)
+                       vpi[i].Opcode = OPCODE_MUL;
+               else
+                       vpi[i].Opcode = OPCODE_MAD;
+
+               vpi[i].Data = 0;
+
+               if (i == 3)
+                       vpi[i].DstReg.File = PROGRAM_OUTPUT;
+               else
+                       vpi[i].DstReg.File = PROGRAM_TEMPORARY;
+               vpi[i].DstReg.Index = 0;
+               vpi[i].DstReg.WriteMask = 0xf;
+               vpi[i].DstReg.CondMask = COND_TR;
+
+               vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
+               vpi[i].SrcReg[0].Index = idx;
+               vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+               vpi[i].SrcReg[1].File = PROGRAM_INPUT;
+               vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
+               vpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(i, i, i, i);
+
+               if (i > 0) {
+                       vpi[i].SrcReg[2].File = PROGRAM_TEMPORARY;
+                       vpi[i].SrcReg[2].Index = 0;
+                       vpi[i].SrcReg[2].Swizzle = SWIZZLE_XYZW;
+               }
+#endif
+       }
+
+       _mesa_copy_instructions(&vpi[i], prog->Instructions,
+                               prog->NumInstructions);
+
+       free(prog->Instructions);
+
+       prog->Instructions = vpi;
+
+       prog->NumInstructions += 4;
+       vpi = &prog->Instructions[prog->NumInstructions - 1];
+
+       assert(vpi->Opcode == OPCODE_END);
+}
+
+static void insert_wpos(struct r300_vertex_program *vp, struct gl_program *prog,
+                       GLuint temp_index)
+{
+       struct prog_instruction *vpi;
+       struct prog_instruction *vpi_insert;
+       int i = 0;
+
+       vpi = _mesa_alloc_instructions(prog->NumInstructions + 2);
+       _mesa_init_instructions(vpi, prog->NumInstructions + 2);
+       /* all but END */
+       _mesa_copy_instructions(vpi, prog->Instructions,
+                               prog->NumInstructions - 1);
+       /* END */
+       _mesa_copy_instructions(&vpi[prog->NumInstructions + 1],
+                               &prog->Instructions[prog->NumInstructions - 1],
+                               1);
+       vpi_insert = &vpi[prog->NumInstructions - 1];
+
+       vpi_insert[i].Opcode = OPCODE_MOV;
+
+       vpi_insert[i].DstReg.File = PROGRAM_OUTPUT;
+       vpi_insert[i].DstReg.Index = VERT_RESULT_HPOS;
+       vpi_insert[i].DstReg.WriteMask = WRITEMASK_XYZW;
+       vpi_insert[i].DstReg.CondMask = COND_TR;
+
+       vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+       vpi_insert[i].SrcReg[0].Index = temp_index;
+       vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+       i++;
+
+       vpi_insert[i].Opcode = OPCODE_MOV;
+
+       vpi_insert[i].DstReg.File = PROGRAM_OUTPUT;
+       vpi_insert[i].DstReg.Index = VERT_RESULT_TEX0 + vp->wpos_idx;
+       vpi_insert[i].DstReg.WriteMask = WRITEMASK_XYZW;
+       vpi_insert[i].DstReg.CondMask = COND_TR;
+
+       vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+       vpi_insert[i].SrcReg[0].Index = temp_index;
+       vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+       i++;
+
+       free(prog->Instructions);
+
+       prog->Instructions = vpi;
+
+       prog->NumInstructions += i;
+       vpi = &prog->Instructions[prog->NumInstructions - 1];
+
+       assert(vpi->Opcode == OPCODE_END);
+}
+
+static void pos_as_texcoord(struct r300_vertex_program *vp,
+                           struct gl_program *prog)
+{
+       struct prog_instruction *vpi;
+       GLuint tempregi = prog->NumTemporaries;
+       /* should do something else if no temps left... */
+       prog->NumTemporaries++;
+
+       for (vpi = prog->Instructions; vpi->Opcode != OPCODE_END; vpi++) {
+               if (vpi->DstReg.File == PROGRAM_OUTPUT
+                   && vpi->DstReg.Index == VERT_RESULT_HPOS) {
+                       vpi->DstReg.File = PROGRAM_TEMPORARY;
+                       vpi->DstReg.Index = tempregi;
+               }
+       }
+       insert_wpos(vp, prog, tempregi);
+}
+
+static struct r300_vertex_program *build_program(struct r300_vertex_program_key
+                                                *wanted_key, struct gl_vertex_program
+                                                *mesa_vp, GLint wpos_idx)
+{
+       struct r300_vertex_program *vp;
+
+       vp = _mesa_calloc(sizeof(*vp));
+       _mesa_memcpy(&vp->key, wanted_key, sizeof(vp->key));
+       vp->wpos_idx = wpos_idx;
+
+       if (mesa_vp->IsPositionInvariant) {
+               position_invariant(&mesa_vp->Base);
+       }
+
+       if (wpos_idx > -1) {
+               pos_as_texcoord(vp, &mesa_vp->Base);
+       }
+
+       assert(mesa_vp->Base.NumInstructions);
+       vp->num_temporaries = mesa_vp->Base.NumTemporaries;
+       r300TranslateVertexShader(vp, mesa_vp->Base.Instructions);
+
+       return vp;
+}
+
+static void add_outputs(struct r300_vertex_program_key *key, GLint vert)
+{
+       if (key->OutputsWritten & (1 << vert))
+               return;
+
+       key->OutputsWritten |= 1 << vert;
+       key->OutputsAdded |= 1 << vert;
+}
+
+void r300SelectVertexShader(r300ContextPtr r300)
+{
+       GLcontext *ctx = ctx = r300->radeon.glCtx;
+       GLuint InputsRead;
+       struct r300_vertex_program_key wanted_key = { 0 };
+       GLint i;
+       struct r300_vertex_program_cont *vpc;
+       struct r300_vertex_program *vp;
+       GLint wpos_idx;
+
+       vpc = (struct r300_vertex_program_cont *)ctx->VertexProgram._Current;
+       wanted_key.InputsRead = vpc->mesa_program.Base.InputsRead;
+       wanted_key.OutputsWritten = vpc->mesa_program.Base.OutputsWritten;
+       InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
+
+       wpos_idx = -1;
+       if (InputsRead & FRAG_BIT_WPOS) {
+               for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
+                       if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
+                               break;
+
+               if (i == ctx->Const.MaxTextureUnits) {
+                       fprintf(stderr, "\tno free texcoord found\n");
+                       _mesa_exit(-1);
+               }
+
+               wanted_key.OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
+               wpos_idx = i;
+       }
+
+       add_outputs(&wanted_key, VERT_RESULT_HPOS);
+
+       if (InputsRead & FRAG_BIT_COL0) {
+               add_outputs(&wanted_key, VERT_RESULT_COL0);
+       }
+
+       if (InputsRead & FRAG_BIT_COL1) {
+               add_outputs(&wanted_key, VERT_RESULT_COL1);
+       }
+
+       for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+               if (InputsRead & (FRAG_BIT_TEX0 << i)) {
+                       add_outputs(&wanted_key, VERT_RESULT_TEX0 + i);
+               }
+       }
+
+       if (vpc->mesa_program.IsPositionInvariant) {
+               /* we wan't position don't we ? */
+               wanted_key.InputsRead |= (1 << VERT_ATTRIB_POS);
+       }
+
+       for (vp = vpc->progs; vp; vp = vp->next)
+               if (_mesa_memcmp(&vp->key, &wanted_key, sizeof(wanted_key))
+                   == 0) {
+                       r300->selected_vp = vp;
+                       return;
+               }
+       //_mesa_print_program(&vpc->mesa_program.Base);
+
+       vp = build_program(&wanted_key, &vpc->mesa_program, wpos_idx);
+       vp->next = vpc->progs;
+       vpc->progs = vp;
+       r300->selected_vp = vp;
+}
diff --git a/src/mesa/drivers/dri/r600/r600_vertprog.h b/src/mesa/drivers/dri/r600/r600_vertprog.h
new file mode 100644 (file)
index 0000000..48c97fa
--- /dev/null
@@ -0,0 +1,35 @@
+#ifndef __R600_VERTPROG_H_
+#define __R600_VERTPROG_H_
+
+#include "r600_reg.h"
+
+#define PVS_OP_DST_OPERAND(opcode, math_inst, macro_inst, reg_index, reg_writemask, reg_class) \
+        (((opcode & PVS_DST_OPCODE_MASK) << PVS_DST_OPCODE_SHIFT)      \
+        | ((math_inst & PVS_DST_MATH_INST_MASK) << PVS_DST_MATH_INST_SHIFT)    \
+        | ((macro_inst & PVS_DST_MACRO_INST_MASK) << PVS_DST_MACRO_INST_SHIFT) \
+        | ((reg_index & PVS_DST_OFFSET_MASK) << PVS_DST_OFFSET_SHIFT)  \
+        | ((reg_writemask & 0xf) << PVS_DST_WE_X_SHIFT)        /* X Y Z W */   \
+        | ((reg_class & PVS_DST_REG_TYPE_MASK) << PVS_DST_REG_TYPE_SHIFT))
+
+#define PVS_SRC_OPERAND(in_reg_index, comp_x, comp_y, comp_z, comp_w, reg_class, negate)       \
+       (((in_reg_index & PVS_SRC_OFFSET_MASK) << PVS_SRC_OFFSET_SHIFT)                         \
+        | ((comp_x & PVS_SRC_SWIZZLE_X_MASK) << PVS_SRC_SWIZZLE_X_SHIFT)                       \
+        | ((comp_y & PVS_SRC_SWIZZLE_Y_MASK) << PVS_SRC_SWIZZLE_Y_SHIFT)                       \
+        | ((comp_z & PVS_SRC_SWIZZLE_Z_MASK) << PVS_SRC_SWIZZLE_Z_SHIFT)                       \
+        | ((comp_w & PVS_SRC_SWIZZLE_W_MASK) << PVS_SRC_SWIZZLE_W_SHIFT)                       \
+        | ((negate & 0xf) << PVS_SRC_MODIFIER_X_SHIFT) /* X Y Z W */                           \
+        | ((reg_class & PVS_SRC_REG_TYPE_MASK) << PVS_SRC_REG_TYPE_SHIFT))
+
+#if 1
+
+#define VSF_FLAG_X     1
+#define VSF_FLAG_Y     2
+#define VSF_FLAG_Z     4
+#define VSF_FLAG_W     8
+#define VSF_FLAG_XYZ   (VSF_FLAG_X | VSF_FLAG_Y | VSF_FLAG_Z)
+#define VSF_FLAG_ALL  0xf
+#define VSF_FLAG_NONE  0
+
+#endif
+
+#endif
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.c b/src/mesa/drivers/dri/r600/r700_fragprog.c
new file mode 100644 (file)
index 0000000..c8041b4
--- /dev/null
@@ -0,0 +1,719 @@
+/*
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "r700_fragprog.h"
+
+#include "radeon_nqssadce.h"
+#include "radeon_program_alu.h"
+
+
+static void reset_srcreg(struct prog_src_register* reg)
+{
+       _mesa_bzero(reg, sizeof(*reg));
+       reg->Swizzle = SWIZZLE_NOOP;
+}
+
+static struct prog_src_register shadow_ambient(struct gl_program *program, int tmu)
+{
+       gl_state_index fail_value_tokens[STATE_LENGTH] = {
+               STATE_INTERNAL, STATE_SHADOW_AMBIENT, 0, 0, 0
+       };
+       struct prog_src_register reg = { 0, };
+
+       fail_value_tokens[2] = tmu;
+       reg.File = PROGRAM_STATE_VAR;
+       reg.Index = _mesa_add_state_reference(program->Parameters, fail_value_tokens);
+       reg.Swizzle = SWIZZLE_WWWW;
+       return reg;
+}
+
+/**
+ * Transform TEX, TXP, TXB, and KIL instructions in the following way:
+ *  - premultiply texture coordinates for RECT
+ *  - extract operand swizzles
+ *  - introduce a temporary register when write masks are needed
+ *
+ */
+static GLboolean transform_TEX(
+       struct radeon_transform_context *t,
+       struct prog_instruction* orig_inst, void* data)
+{
+       struct r500_fragment_program_compiler *compiler =
+               (struct r500_fragment_program_compiler*)data;
+       struct prog_instruction inst = *orig_inst;
+       struct prog_instruction* tgt;
+       GLboolean destredirect = GL_FALSE;
+
+       if (inst.Opcode != OPCODE_TEX &&
+           inst.Opcode != OPCODE_TXB &&
+           inst.Opcode != OPCODE_TXP &&
+           inst.Opcode != OPCODE_KIL)
+               return GL_FALSE;
+
+       /* ARB_shadow & EXT_shadow_funcs */
+       if (inst.Opcode != OPCODE_KIL &&
+           t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) {
+               GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
+
+               if (comparefunc == GL_NEVER || comparefunc == GL_ALWAYS) {
+                       tgt = radeonAppendInstructions(t->Program, 1);
+
+                       tgt->Opcode = OPCODE_MOV;
+                       tgt->DstReg = inst.DstReg;
+                       if (comparefunc == GL_ALWAYS) {
+                               tgt->SrcReg[0].File = PROGRAM_BUILTIN;
+                               tgt->SrcReg[0].Swizzle = SWIZZLE_1111;
+                       } else {
+                               tgt->SrcReg[0] = shadow_ambient(t->Program, inst.TexSrcUnit);
+                       }
+                       return GL_TRUE;
+               }
+
+               inst.DstReg.File = PROGRAM_TEMPORARY;
+               inst.DstReg.Index = radeonFindFreeTemporary(t);
+               inst.DstReg.WriteMask = WRITEMASK_XYZW;
+       } else if (inst.Opcode != OPCODE_KIL && inst.DstReg.File != PROGRAM_TEMPORARY) {
+               int tempreg = radeonFindFreeTemporary(t);
+
+               inst.DstReg.File = PROGRAM_TEMPORARY;
+               inst.DstReg.Index = tempreg;
+               inst.DstReg.WriteMask = WRITEMASK_XYZW;
+               destredirect = GL_TRUE;
+       }
+
+       if (inst.SrcReg[0].File != PROGRAM_TEMPORARY && inst.SrcReg[0].File != PROGRAM_INPUT) {
+               int tmpreg = radeonFindFreeTemporary(t);
+               tgt = radeonAppendInstructions(t->Program, 1);
+               tgt->Opcode = OPCODE_MOV;
+               tgt->DstReg.File = PROGRAM_TEMPORARY;
+               tgt->DstReg.Index = tmpreg;
+               tgt->SrcReg[0] = inst.SrcReg[0];
+
+               reset_srcreg(&inst.SrcReg[0]);
+               inst.SrcReg[0].File = PROGRAM_TEMPORARY;
+               inst.SrcReg[0].Index = tmpreg;
+       }
+
+       tgt = radeonAppendInstructions(t->Program, 1);
+       _mesa_copy_instructions(tgt, &inst, 1);
+
+       if (inst.Opcode != OPCODE_KIL &&
+           t->Program->ShadowSamplers & (1 << inst.TexSrcUnit)) {
+               GLuint comparefunc = GL_NEVER + compiler->fp->state.unit[inst.TexSrcUnit].texture_compare_func;
+               GLuint depthmode = compiler->fp->state.unit[inst.TexSrcUnit].depth_texture_mode;
+               int rcptemp = radeonFindFreeTemporary(t);
+               int pass, fail;
+
+               tgt = radeonAppendInstructions(t->Program, 3);
+
+               tgt[0].Opcode = OPCODE_RCP;
+               tgt[0].DstReg.File = PROGRAM_TEMPORARY;
+               tgt[0].DstReg.Index = rcptemp;
+               tgt[0].DstReg.WriteMask = WRITEMASK_W;
+               tgt[0].SrcReg[0] = inst.SrcReg[0];
+               tgt[0].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+
+               tgt[1].Opcode = OPCODE_MAD;
+               tgt[1].DstReg = inst.DstReg;
+               tgt[1].DstReg.WriteMask = orig_inst->DstReg.WriteMask;
+               tgt[1].SrcReg[0] = inst.SrcReg[0];
+               tgt[1].SrcReg[0].Swizzle = SWIZZLE_ZZZZ;
+               tgt[1].SrcReg[1].File = PROGRAM_TEMPORARY;
+               tgt[1].SrcReg[1].Index = rcptemp;
+               tgt[1].SrcReg[1].Swizzle = SWIZZLE_WWWW;
+               tgt[1].SrcReg[2].File = PROGRAM_TEMPORARY;
+               tgt[1].SrcReg[2].Index = inst.DstReg.Index;
+               if (depthmode == 0) /* GL_LUMINANCE */
+                       tgt[1].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z);
+               else if (depthmode == 2) /* GL_ALPHA */
+                       tgt[1].SrcReg[2].Swizzle = SWIZZLE_WWWW;
+
+               /* Recall that SrcReg[0] is tex, SrcReg[2] is r and:
+                *   r  < tex  <=>      -tex+r < 0
+                *   r >= tex  <=> not (-tex+r < 0 */
+               if (comparefunc == GL_LESS || comparefunc == GL_GEQUAL)
+                       tgt[1].SrcReg[2].NegateBase = tgt[0].SrcReg[2].NegateBase ^ NEGATE_XYZW;
+               else
+                       tgt[1].SrcReg[0].NegateBase = tgt[0].SrcReg[0].NegateBase ^ NEGATE_XYZW;
+
+               tgt[2].Opcode = OPCODE_CMP;
+               tgt[2].DstReg = orig_inst->DstReg;
+               tgt[2].SrcReg[0].File = PROGRAM_TEMPORARY;
+               tgt[2].SrcReg[0].Index = tgt[1].DstReg.Index;
+
+               if (comparefunc == GL_LESS || comparefunc == GL_GREATER) {
+                       pass = 1;
+                       fail = 2;
+               } else {
+                       pass = 2;
+                       fail = 1;
+               }
+
+               tgt[2].SrcReg[pass].File = PROGRAM_BUILTIN;
+               tgt[2].SrcReg[pass].Swizzle = SWIZZLE_1111;
+               tgt[2].SrcReg[fail] = shadow_ambient(t->Program, inst.TexSrcUnit);
+       } else if (destredirect) {
+               tgt = radeonAppendInstructions(t->Program, 1);
+
+               tgt->Opcode = OPCODE_MOV;
+               tgt->DstReg = orig_inst->DstReg;
+               tgt->SrcReg[0].File = PROGRAM_TEMPORARY;
+               tgt->SrcReg[0].Index = inst.DstReg.Index;
+       }
+
+       return GL_TRUE;
+}
+
+
+static void update_params(r300ContextPtr r300, struct r500_fragment_program *fp)
+{
+       struct gl_fragment_program *mp = &fp->mesa_program;
+
+       /* Ask Mesa nicely to fill in ParameterValues for us */
+       if (mp->Base.Parameters)
+               _mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
+}
+
+
+/**
+ * Transform the program to support fragment.position.
+ *
+ * Introduce a small fragment at the start of the program that will be
+ * the only code that directly reads the FRAG_ATTRIB_WPOS input.
+ * All other code pieces that reference that input will be rewritten
+ * to read from a newly allocated temporary.
+ *
+ * \todo if/when r5xx supports the radeon_program architecture, this is a
+ * likely candidate for code sharing.
+ */
+static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler)
+{
+       GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
+
+       if (!(InputsRead & FRAG_BIT_WPOS))
+               return;
+
+       static gl_state_index tokens[STATE_LENGTH] = {
+               STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
+       };
+       struct prog_instruction *fpi;
+       GLuint window_index;
+       int i = 0;
+       GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
+
+       _mesa_insert_instructions(compiler->program, 0, 3);
+       fpi = compiler->program->Instructions;
+
+       /* perspective divide */
+       fpi[i].Opcode = OPCODE_RCP;
+
+       fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+       fpi[i].DstReg.Index = tempregi;
+       fpi[i].DstReg.WriteMask = WRITEMASK_W;
+       fpi[i].DstReg.CondMask = COND_TR;
+
+       fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+       fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+       fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+       i++;
+
+       fpi[i].Opcode = OPCODE_MUL;
+
+       fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+       fpi[i].DstReg.Index = tempregi;
+       fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+       fpi[i].DstReg.CondMask = COND_TR;
+
+       fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+       fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
+       fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+       fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
+       fpi[i].SrcReg[1].Index = tempregi;
+       fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
+       i++;
+
+       /* viewport transformation */
+       window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
+
+       fpi[i].Opcode = OPCODE_MAD;
+
+       fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+       fpi[i].DstReg.Index = tempregi;
+       fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+       fpi[i].DstReg.CondMask = COND_TR;
+
+       fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+       fpi[i].SrcReg[0].Index = tempregi;
+       fpi[i].SrcReg[0].Swizzle =
+           MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+       fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
+       fpi[i].SrcReg[1].Index = window_index;
+       fpi[i].SrcReg[1].Swizzle =
+           MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+       fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
+       fpi[i].SrcReg[2].Index = window_index;
+       fpi[i].SrcReg[2].Swizzle =
+           MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+       i++;
+
+       for (; i < compiler->program->NumInstructions; ++i) {
+               int reg;
+               for (reg = 0; reg < 3; reg++) {
+                       if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
+                           fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
+                               fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
+                               fpi[i].SrcReg[reg].Index = tempregi;
+                       }
+               }
+       }
+}
+
+
+static void nqssadce_init(struct nqssadce_state* s)
+{
+       s->Outputs[FRAG_RESULT_COLOR].Sourced = WRITEMASK_XYZW;
+       s->Outputs[FRAG_RESULT_DEPTH].Sourced = WRITEMASK_W;
+}
+
+static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
+{
+       GLuint relevant;
+       int i;
+
+       if (opcode == OPCODE_TEX ||
+           opcode == OPCODE_TXB ||
+           opcode == OPCODE_TXP ||
+           opcode == OPCODE_KIL) {
+               if (reg.Abs)
+                       return GL_FALSE;
+
+               if (reg.NegateAbs)
+                       reg.NegateBase ^= 15;
+
+               if (opcode == OPCODE_KIL) {
+                       if (reg.Swizzle != SWIZZLE_NOOP)
+                               return GL_FALSE;
+               } else {
+                       for(i = 0; i < 4; ++i) {
+                               GLuint swz = GET_SWZ(reg.Swizzle, i);
+                               if (swz == SWIZZLE_NIL) {
+                                       reg.NegateBase &= ~(1 << i);
+                                       continue;
+                               }
+                               if (swz >= 4)
+                                       return GL_FALSE;
+                       }
+               }
+
+               if (reg.NegateBase)
+                       return GL_FALSE;
+
+               return GL_TRUE;
+       } else if (opcode == OPCODE_DDX || opcode == OPCODE_DDY) {
+               /* DDX/MDH and DDY/MDV explicitly ignore incoming swizzles;
+                * if it doesn't fit perfectly into a .xyzw case... */
+               if (reg.Swizzle == SWIZZLE_NOOP && !reg.Abs
+                               && !reg.NegateBase && !reg.NegateAbs)
+                       return GL_TRUE;
+
+               return GL_FALSE;
+       } else {
+               /* ALU instructions support almost everything */
+               if (reg.Abs)
+                       return GL_TRUE;
+
+               relevant = 0;
+               for(i = 0; i < 3; ++i) {
+                       GLuint swz = GET_SWZ(reg.Swizzle, i);
+                       if (swz != SWIZZLE_NIL && swz != SWIZZLE_ZERO)
+                               relevant |= 1 << i;
+               }
+               if ((reg.NegateBase & relevant) && ((reg.NegateBase & relevant) != relevant))
+                       return GL_FALSE;
+
+               return GL_TRUE;
+       }
+}
+
+/**
+ * Implement a MOV with a potentially non-native swizzle.
+ *
+ * The only thing we *cannot* do in an ALU instruction is per-component
+ * negation. Therefore, we split the MOV into two instructions when necessary.
+ */
+static void nqssadce_build_swizzle(struct nqssadce_state *s,
+       struct prog_dst_register dst, struct prog_src_register src)
+{
+       struct prog_instruction *inst;
+       GLuint negatebase[2] = { 0, 0 };
+       int i;
+
+       for(i = 0; i < 4; ++i) {
+               GLuint swz = GET_SWZ(src.Swizzle, i);
+               if (swz == SWIZZLE_NIL)
+                       continue;
+               negatebase[GET_BIT(src.NegateBase, i)] |= 1 << i;
+       }
+
+       _mesa_insert_instructions(s->Program, s->IP, (negatebase[0] ? 1 : 0) + (negatebase[1] ? 1 : 0));
+       inst = s->Program->Instructions + s->IP;
+
+       for(i = 0; i <= 1; ++i) {
+               if (!negatebase[i])
+                       continue;
+
+               inst->Opcode = OPCODE_MOV;
+               inst->DstReg = dst;
+               inst->DstReg.WriteMask = negatebase[i];
+               inst->SrcReg[0] = src;
+               inst++;
+               s->IP++;
+       }
+}
+
+static GLuint build_dtm(GLuint depthmode)
+{
+       switch(depthmode) {
+       default:
+       case GL_LUMINANCE: return 0;
+       case GL_INTENSITY: return 1;
+       case GL_ALPHA: return 2;
+       }
+}
+
+static GLuint build_func(GLuint comparefunc)
+{
+       return comparefunc - GL_NEVER;
+}
+
+
+/**
+ * Collect all external state that is relevant for compiling the given
+ * fragment program.
+ */
+static void build_state(
+       r300ContextPtr r300,
+       struct r500_fragment_program *fp,
+       struct r500_fragment_program_external_state *state)
+{
+       int unit;
+
+       _mesa_bzero(state, sizeof(*state));
+
+       for(unit = 0; unit < 16; ++unit) {
+               if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
+                       struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
+
+                       state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
+                       state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
+               }
+       }
+}
+
+static void dump_program(struct r500_fragment_program_code *code);
+
+void r500TranslateFragmentShader(r300ContextPtr r300,
+                                struct r500_fragment_program *fp)
+{
+       struct r500_fragment_program_external_state state;
+
+       build_state(r300, fp, &state);
+       if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
+               /* TODO: cache compiled programs */
+               fp->translated = GL_FALSE;
+               _mesa_memcpy(&fp->state, &state, sizeof(state));
+       }
+
+       if (!fp->translated) {
+               struct r500_fragment_program_compiler compiler;
+
+               compiler.r300 = r300;
+               compiler.fp = fp;
+               compiler.code = &fp->code;
+               compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base);
+
+               if (RADEON_DEBUG & DEBUG_PIXEL) {
+                       _mesa_printf("Compiler: Initial program:\n");
+                       _mesa_print_program(compiler.program);
+               }
+
+               insert_WPOS_trailer(&compiler);
+
+               struct radeon_program_transformation transformations[] = {
+                       { &transform_TEX, &compiler },
+                       { &radeonTransformALU, 0 },
+                       { &radeonTransformDeriv, 0 },
+                       { &radeonTransformTrigScale, 0 }
+               };
+               radeonLocalTransform(r300->radeon.glCtx, compiler.program,
+                       4, transformations);
+
+               if (RADEON_DEBUG & DEBUG_PIXEL) {
+                       _mesa_printf("Compiler: after native rewrite:\n");
+                       _mesa_print_program(compiler.program);
+               }
+
+               struct radeon_nqssadce_descr nqssadce = {
+                       .Init = &nqssadce_init,
+                       .IsNativeSwizzle = &is_native_swizzle,
+                       .BuildSwizzle = &nqssadce_build_swizzle,
+                       .RewriteDepthOut = GL_TRUE
+               };
+               radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce);
+
+               if (RADEON_DEBUG & DEBUG_PIXEL) {
+                       _mesa_printf("Compiler: after NqSSA-DCE:\n");
+                       _mesa_print_program(compiler.program);
+               }
+
+               fp->translated = r500FragmentProgramEmit(&compiler);
+
+               /* Subtle: Rescue any parameters that have been added during transformations */
+               _mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
+               fp->mesa_program.Base.Parameters = compiler.program->Parameters;
+               compiler.program->Parameters = 0;
+
+               _mesa_reference_program(r300->radeon.glCtx, &compiler.program, 0);
+
+               r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
+
+               if (RADEON_DEBUG & DEBUG_PIXEL) {
+                       if (fp->translated) {
+                               _mesa_printf("Machine-readable code:\n");
+                               dump_program(&fp->code);
+                       }
+               }
+
+       }
+
+       update_params(r300, fp);
+
+}
+
+static char *toswiz(int swiz_val) {
+  switch(swiz_val) {
+  case 0: return "R";
+  case 1: return "G";
+  case 2: return "B";
+  case 3: return "A";
+  case 4: return "0";
+  case 5: return "1/2";
+  case 6: return "1";
+  case 7: return "U";
+  }
+  return NULL;
+}
+
+static char *toop(int op_val)
+{
+  char *str = NULL;
+  switch (op_val) {
+  case 0: str = "MAD"; break;
+  case 1: str = "DP3"; break;
+  case 2: str = "DP4"; break;
+  case 3: str = "D2A"; break;
+  case 4: str = "MIN"; break;
+  case 5: str = "MAX"; break;
+  case 6: str = "Reserved"; break;
+  case 7: str = "CND"; break;
+  case 8: str = "CMP"; break;
+  case 9: str = "FRC"; break;
+  case 10: str = "SOP"; break;
+  case 11: str = "MDH"; break;
+  case 12: str = "MDV"; break;
+  }
+  return str;
+}
+
+static char *to_alpha_op(int op_val)
+{
+  char *str = NULL;
+  switch (op_val) {
+  case 0: str = "MAD"; break;
+  case 1: str = "DP"; break;
+  case 2: str = "MIN"; break;
+  case 3: str = "MAX"; break;
+  case 4: str = "Reserved"; break;
+  case 5: str = "CND"; break;
+  case 6: str = "CMP"; break;
+  case 7: str = "FRC"; break;
+  case 8: str = "EX2"; break;
+  case 9: str = "LN2"; break;
+  case 10: str = "RCP"; break;
+  case 11: str = "RSQ"; break;
+  case 12: str = "SIN"; break;
+  case 13: str = "COS"; break;
+  case 14: str = "MDH"; break;
+  case 15: str = "MDV"; break;
+  }
+  return str;
+}
+
+static char *to_mask(int val)
+{
+  char *str = NULL;
+  switch(val) {
+  case 0: str = "NONE"; break;
+  case 1: str = "R"; break;
+  case 2: str = "G"; break;
+  case 3: str = "RG"; break;
+  case 4: str = "B"; break;
+  case 5: str = "RB"; break;
+  case 6: str = "GB"; break;
+  case 7: str = "RGB"; break;
+  case 8: str = "A"; break;
+  case 9: str = "AR"; break;
+  case 10: str = "AG"; break;
+  case 11: str = "ARG"; break;
+  case 12: str = "AB"; break;
+  case 13: str = "ARB"; break;
+  case 14: str = "AGB"; break;
+  case 15: str = "ARGB"; break;
+  }
+  return str;
+}
+
+static char *to_texop(int val)
+{
+  switch(val) {
+  case 0: return "NOP";
+  case 1: return "LD";
+  case 2: return "TEXKILL";
+  case 3: return "PROJ";
+  case 4: return "LODBIAS";
+  case 5: return "LOD";
+  case 6: return "DXDY";
+  }
+  return NULL;
+}
+
+static void dump_program(struct r500_fragment_program_code *code)
+{
+
+  fprintf(stderr, "R500 Fragment Program:\n--------\n");
+
+  int n;
+  uint32_t inst;
+  uint32_t inst0;
+  char *str = NULL;
+
+  if (code->const_nr) {
+    fprintf(stderr, "--------\nConstants:\n");
+    for (n = 0; n < code->const_nr; n++) {
+      fprintf(stderr, "Constant %d: %i[%i]\n", n,
+        code->constant[n].File, code->constant[n].Index);
+    }
+    fprintf(stderr, "--------\n");
+  }
+
+  for (n = 0; n < code->inst_end+1; n++) {
+    inst0 = inst = code->inst[n].inst0;
+    fprintf(stderr,"%d\t0:CMN_INST   0x%08x:", n, inst);
+    switch(inst & 0x3) {
+    case R500_INST_TYPE_ALU: str = "ALU"; break;
+    case R500_INST_TYPE_OUT: str = "OUT"; break;
+    case R500_INST_TYPE_FC: str = "FC"; break;
+    case R500_INST_TYPE_TEX: str = "TEX"; break;
+    };
+    fprintf(stderr,"%s %s %s %s %s ", str,
+           inst & R500_INST_TEX_SEM_WAIT ? "TEX_WAIT" : "",
+           inst & R500_INST_LAST ? "LAST" : "",
+           inst & R500_INST_NOP ? "NOP" : "",
+           inst & R500_INST_ALU_WAIT ? "ALU WAIT" : "");
+    fprintf(stderr,"wmask: %s omask: %s\n", to_mask((inst >> 11) & 0xf),
+           to_mask((inst >> 15) & 0xf));
+
+    switch(inst0 & 0x3) {
+    case 0:
+    case 1:
+      fprintf(stderr,"\t1:RGB_ADDR   0x%08x:", code->inst[n].inst1);
+      inst = code->inst[n].inst1;
+
+      fprintf(stderr,"Addr0: %d%c, Addr1: %d%c, Addr2: %d%c, srcp:%d\n",
+             inst & 0xff, (inst & (1<<8)) ? 'c' : 't',
+             (inst >> 10) & 0xff, (inst & (1<<18)) ? 'c' : 't',
+             (inst >> 20) & 0xff, (inst & (1<<28)) ? 'c' : 't',
+             (inst >> 30));
+
+      fprintf(stderr,"\t2:ALPHA_ADDR 0x%08x:", code->inst[n].inst2);
+      inst = code->inst[n].inst2;
+      fprintf(stderr,"Addr0: %d%c, Addr1: %d%c, Addr2: %d%c, srcp:%d\n",
+             inst & 0xff, (inst & (1<<8)) ? 'c' : 't',
+             (inst >> 10) & 0xff, (inst & (1<<18)) ? 'c' : 't',
+             (inst >> 20) & 0xff, (inst & (1<<28)) ? 'c' : 't',
+             (inst >> 30));
+      fprintf(stderr,"\t3 RGB_INST:  0x%08x:", code->inst[n].inst3);
+      inst = code->inst[n].inst3;
+      fprintf(stderr,"rgb_A_src:%d %s/%s/%s %d rgb_B_src:%d %s/%s/%s %d\n",
+             (inst) & 0x3, toswiz((inst >> 2) & 0x7), toswiz((inst >> 5) & 0x7), toswiz((inst >> 8) & 0x7),
+             (inst >> 11) & 0x3,
+             (inst >> 13) & 0x3, toswiz((inst >> 15) & 0x7), toswiz((inst >> 18) & 0x7), toswiz((inst >> 21) & 0x7),
+             (inst >> 24) & 0x3);
+
+
+      fprintf(stderr,"\t4 ALPHA_INST:0x%08x:", code->inst[n].inst4);
+      inst = code->inst[n].inst4;
+      fprintf(stderr,"%s dest:%d%s alp_A_src:%d %s %d alp_B_src:%d %s %d w:%d\n", to_alpha_op(inst & 0xf),
+             (inst >> 4) & 0x7f, inst & (1<<11) ? "(rel)":"",
+             (inst >> 12) & 0x3, toswiz((inst >> 14) & 0x7), (inst >> 17) & 0x3,
+             (inst >> 19) & 0x3, toswiz((inst >> 21) & 0x7), (inst >> 24) & 0x3,
+             (inst >> 31) & 0x1);
+
+      fprintf(stderr,"\t5 RGBA_INST: 0x%08x:", code->inst[n].inst5);
+      inst = code->inst[n].inst5;
+      fprintf(stderr,"%s dest:%d%s rgb_C_src:%d %s/%s/%s %d alp_C_src:%d %s %d\n", toop(inst & 0xf),
+             (inst >> 4) & 0x7f, inst & (1<<11) ? "(rel)":"",
+             (inst >> 12) & 0x3, toswiz((inst >> 14) & 0x7), toswiz((inst >> 17) & 0x7), toswiz((inst >> 20) & 0x7),
+             (inst >> 23) & 0x3,
+             (inst >> 25) & 0x3, toswiz((inst >> 27) & 0x7), (inst >> 30) & 0x3);
+      break;
+    case 2:
+      break;
+    case 3:
+      inst = code->inst[n].inst1;
+      fprintf(stderr,"\t1:TEX_INST:  0x%08x: id: %d op:%s, %s, %s %s\n", inst, (inst >> 16) & 0xf,
+             to_texop((inst >> 22) & 0x7), (inst & (1<<25)) ? "ACQ" : "",
+             (inst & (1<<26)) ? "IGNUNC" : "", (inst & (1<<27)) ? "UNSCALED" : "SCALED");
+      inst = code->inst[n].inst2;
+      fprintf(stderr,"\t2:TEX_ADDR:  0x%08x: src: %d%s %s/%s/%s/%s dst: %d%s %s/%s/%s/%s\n", inst,
+             inst & 127, inst & (1<<7) ? "(rel)" : "",
+             toswiz((inst >> 8) & 0x3), toswiz((inst >> 10) & 0x3),
+             toswiz((inst >> 12) & 0x3), toswiz((inst >> 14) & 0x3),
+             (inst >> 16) & 127, inst & (1<<23) ? "(rel)" : "",
+             toswiz((inst >> 24) & 0x3), toswiz((inst >> 26) & 0x3),
+             toswiz((inst >> 28) & 0x3), toswiz((inst >> 30) & 0x3));
+
+      fprintf(stderr,"\t3:TEX_DXDY:  0x%08x\n", code->inst[n].inst3);
+      break;
+    }
+    fprintf(stderr,"\n");
+  }
+
+}
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.h b/src/mesa/drivers/dri/r600/r700_fragprog.h
new file mode 100644 (file)
index 0000000..e48cb60
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/*
+ * Authors:
+ *   Ben Skeggs <darktama@iinet.net.au>
+ *   Jerome Glisse <j.glisse@gmail.com>
+ */
+#ifndef __R700_FRAGPROG_H_
+#define __R700_FRAGPROG_H_
+
+#include "main/glheader.h"
+#include "main/macros.h"
+#include "main/enums.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+#include "shader/program.h"
+#include "shader/prog_instruction.h"
+
+#include "r600_context.h"
+#include "r600_state.h"
+#include "radeon_program.h"
+
+struct r500_fragment_program;
+
+extern void r500TranslateFragmentShader(r300ContextPtr r300,
+                                       struct r500_fragment_program *fp);
+
+struct r500_fragment_program_compiler {
+       r300ContextPtr r300;
+       struct r500_fragment_program *fp;
+       struct r500_fragment_program_code *code;
+       struct gl_program *program;
+};
+
+extern GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler);
+
+#endif
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog_emit.c b/src/mesa/drivers/dri/r600/r700_fragprog_emit.c
new file mode 100644 (file)
index 0000000..ccffd7c
--- /dev/null
@@ -0,0 +1,327 @@
+/*
+ * Copyright (C) 2005 Ben Skeggs.
+ *
+ * Copyright 2008 Corbin Simpson <MostAwesomeDude@gmail.com>
+ * Adaptation and modification for ATI/AMD Radeon R500 GPU chipsets.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * \author Ben Skeggs <darktama@iinet.net.au>
+ *
+ * \author Jerome Glisse <j.glisse@gmail.com>
+ *
+ * \author Corbin Simpson <MostAwesomeDude@gmail.com>
+ *
+ * \todo Depth write, WPOS/FOGC inputs
+ *
+ * \todo FogOption
+ *
+ */
+
+#include "r700_fragprog.h"
+
+#include "radeon_program_pair.h"
+
+
+#define PROG_CODE \
+       struct r500_fragment_program_compiler *c = (struct r500_fragment_program_compiler*)data; \
+       struct r500_fragment_program_code *code = c->code
+
+#define error(fmt, args...) do {                       \
+               fprintf(stderr, "%s::%s(): " fmt "\n",  \
+                       __FILE__, __FUNCTION__, ##args);        \
+       } while(0)
+
+
+/**
+ * Callback to register hardware constants.
+ */
+static GLboolean emit_const(void *data, GLuint file, GLuint idx, GLuint *hwindex)
+{
+       PROG_CODE;
+
+       for (*hwindex = 0; *hwindex < code->const_nr; ++*hwindex) {
+               if (code->constant[*hwindex].File == file &&
+                   code->constant[*hwindex].Index == idx)
+                       break;
+       }
+
+       if (*hwindex >= code->const_nr) {
+               if (*hwindex >= PFS_NUM_CONST_REGS) {
+                       error("Out of hw constants!\n");
+                       return GL_FALSE;
+               }
+
+               code->const_nr++;
+               code->constant[*hwindex].File = file;
+               code->constant[*hwindex].Index = idx;
+       }
+
+       return GL_TRUE;
+}
+
+static GLuint translate_rgb_op(GLuint opcode)
+{
+       switch(opcode) {
+       case OPCODE_CMP: return R500_ALU_RGBA_OP_CMP;
+       case OPCODE_DDX: return R500_ALU_RGBA_OP_MDH;
+       case OPCODE_DDY: return R500_ALU_RGBA_OP_MDV;
+       case OPCODE_DP3: return R500_ALU_RGBA_OP_DP3;
+       case OPCODE_DP4: return R500_ALU_RGBA_OP_DP4;
+       case OPCODE_FRC: return R500_ALU_RGBA_OP_FRC;
+       default:
+               error("translate_rgb_op(%d): unknown opcode\n", opcode);
+               /* fall through */
+       case OPCODE_NOP:
+               /* fall through */
+       case OPCODE_MAD: return R500_ALU_RGBA_OP_MAD;
+       case OPCODE_MAX: return R500_ALU_RGBA_OP_MAX;
+       case OPCODE_MIN: return R500_ALU_RGBA_OP_MIN;
+       case OPCODE_REPL_ALPHA: return R500_ALU_RGBA_OP_SOP;
+       }
+}
+
+static GLuint translate_alpha_op(GLuint opcode)
+{
+       switch(opcode) {
+       case OPCODE_CMP: return R500_ALPHA_OP_CMP;
+       case OPCODE_COS: return R500_ALPHA_OP_COS;
+       case OPCODE_DDX: return R500_ALPHA_OP_MDH;
+       case OPCODE_DDY: return R500_ALPHA_OP_MDV;
+       case OPCODE_DP3: return R500_ALPHA_OP_DP;
+       case OPCODE_DP4: return R500_ALPHA_OP_DP;
+       case OPCODE_EX2: return R500_ALPHA_OP_EX2;
+       case OPCODE_FRC: return R500_ALPHA_OP_FRC;
+       case OPCODE_LG2: return R500_ALPHA_OP_LN2;
+       default:
+               error("translate_alpha_op(%d): unknown opcode\n", opcode);
+               /* fall through */
+       case OPCODE_NOP:
+               /* fall through */
+       case OPCODE_MAD: return R500_ALPHA_OP_MAD;
+       case OPCODE_MAX: return R500_ALPHA_OP_MAX;
+       case OPCODE_MIN: return R500_ALPHA_OP_MIN;
+       case OPCODE_RCP: return R500_ALPHA_OP_RCP;
+       case OPCODE_RSQ: return R500_ALPHA_OP_RSQ;
+       case OPCODE_SIN: return R500_ALPHA_OP_SIN;
+       }
+}
+
+static GLuint fix_hw_swizzle(GLuint swz)
+{
+       if (swz == 5) swz = 6;
+       if (swz == SWIZZLE_NIL) swz = 4;
+       return swz;
+}
+
+static GLuint translate_arg_rgb(struct radeon_pair_instruction *inst, int arg)
+{
+       GLuint t = inst->RGB.Arg[arg].Source;
+       int comp;
+       t |= inst->RGB.Arg[arg].Negate << 11;
+       t |= inst->RGB.Arg[arg].Abs << 12;
+
+       for(comp = 0; comp < 3; ++comp)
+               t |= fix_hw_swizzle(GET_SWZ(inst->RGB.Arg[arg].Swizzle, comp)) << (3*comp + 2);
+
+       return t;
+}
+
+static GLuint translate_arg_alpha(struct radeon_pair_instruction *inst, int i)
+{
+       GLuint t = inst->Alpha.Arg[i].Source;
+       t |= fix_hw_swizzle(inst->Alpha.Arg[i].Swizzle) << 2;
+       t |= inst->Alpha.Arg[i].Negate << 5;
+       t |= inst->Alpha.Arg[i].Abs << 6;
+       return t;
+}
+
+static void use_temporary(struct r500_fragment_program_code* code, GLuint index)
+{
+       if (index > code->max_temp_idx)
+               code->max_temp_idx = index;
+}
+
+static GLuint use_source(struct r500_fragment_program_code* code, struct radeon_pair_instruction_source src)
+{
+       if (!src.Constant)
+               use_temporary(code, src.Index);
+       return src.Index | src.Constant << 8;
+}
+
+
+/**
+ * Emit a paired ALU instruction.
+ */
+static GLboolean emit_paired(void *data, struct radeon_pair_instruction *inst)
+{
+       PROG_CODE;
+
+       if (code->inst_end >= 511) {
+               error("emit_alu: Too many instructions");
+               return GL_FALSE;
+       }
+
+       int ip = ++code->inst_end;
+
+       code->inst[ip].inst5 = translate_rgb_op(inst->RGB.Opcode);
+       code->inst[ip].inst4 = translate_alpha_op(inst->Alpha.Opcode);
+
+       if (inst->RGB.OutputWriteMask || inst->Alpha.OutputWriteMask || inst->Alpha.DepthWriteMask)
+               code->inst[ip].inst0 = R500_INST_TYPE_OUT;
+       else
+               code->inst[ip].inst0 = R500_INST_TYPE_ALU;
+       code->inst[ip].inst0 |= R500_INST_TEX_SEM_WAIT;
+
+       code->inst[ip].inst0 |= (inst->RGB.WriteMask << 11) | (inst->Alpha.WriteMask << 14);
+       code->inst[ip].inst0 |= (inst->RGB.OutputWriteMask << 15) | (inst->Alpha.OutputWriteMask << 18);
+       if (inst->Alpha.DepthWriteMask) {
+               code->inst[ip].inst4 |= R500_ALPHA_W_OMASK;
+               c->fp->writes_depth = GL_TRUE;
+       }
+
+       code->inst[ip].inst4 |= R500_ALPHA_ADDRD(inst->Alpha.DestIndex);
+       code->inst[ip].inst5 |= R500_ALU_RGBA_ADDRD(inst->RGB.DestIndex);
+       use_temporary(code, inst->Alpha.DestIndex);
+       use_temporary(code, inst->RGB.DestIndex);
+
+       if (inst->RGB.Saturate)
+               code->inst[ip].inst0 |= R500_INST_RGB_CLAMP;
+       if (inst->Alpha.Saturate)
+               code->inst[ip].inst0 |= R500_INST_ALPHA_CLAMP;
+
+       code->inst[ip].inst1 |= R500_RGB_ADDR0(use_source(code, inst->RGB.Src[0]));
+       code->inst[ip].inst1 |= R500_RGB_ADDR1(use_source(code, inst->RGB.Src[1]));
+       code->inst[ip].inst1 |= R500_RGB_ADDR2(use_source(code, inst->RGB.Src[2]));
+
+       code->inst[ip].inst2 |= R500_ALPHA_ADDR0(use_source(code, inst->Alpha.Src[0]));
+       code->inst[ip].inst2 |= R500_ALPHA_ADDR1(use_source(code, inst->Alpha.Src[1]));
+       code->inst[ip].inst2 |= R500_ALPHA_ADDR2(use_source(code, inst->Alpha.Src[2]));
+
+       code->inst[ip].inst3 |= translate_arg_rgb(inst, 0) << R500_ALU_RGB_SEL_A_SHIFT;
+       code->inst[ip].inst3 |= translate_arg_rgb(inst, 1) << R500_ALU_RGB_SEL_B_SHIFT;
+       code->inst[ip].inst5 |= translate_arg_rgb(inst, 2) << R500_ALU_RGBA_SEL_C_SHIFT;
+
+       code->inst[ip].inst4 |= translate_arg_alpha(inst, 0) << R500_ALPHA_SEL_A_SHIFT;
+       code->inst[ip].inst4 |= translate_arg_alpha(inst, 1) << R500_ALPHA_SEL_B_SHIFT;
+       code->inst[ip].inst5 |= translate_arg_alpha(inst, 2) << R500_ALU_RGBA_ALPHA_SEL_C_SHIFT;
+
+       return GL_TRUE;
+}
+
+static GLuint translate_strq_swizzle(struct prog_src_register src)
+{
+       GLuint swiz = 0;
+       int i;
+       for (i = 0; i < 4; i++)
+               swiz |= (GET_SWZ(src.Swizzle, i) & 0x3) << i*2;
+       return swiz;
+}
+
+/**
+ * Emit a single TEX instruction
+ */
+static GLboolean emit_tex(void *data, struct prog_instruction *inst)
+{
+       PROG_CODE;
+
+       if (code->inst_end >= 511) {
+               error("emit_tex: Too many instructions");
+               return GL_FALSE;
+       }
+
+       int ip = ++code->inst_end;
+
+       code->inst[ip].inst0 = R500_INST_TYPE_TEX
+               | (inst->DstReg.WriteMask << 11)
+               | R500_INST_TEX_SEM_WAIT;
+       code->inst[ip].inst1 = R500_TEX_ID(inst->TexSrcUnit)
+               | R500_TEX_SEM_ACQUIRE | R500_TEX_IGNORE_UNCOVERED;
+
+       if (inst->TexSrcTarget == TEXTURE_RECT_INDEX)
+               code->inst[ip].inst1 |= R500_TEX_UNSCALED;
+
+       switch (inst->Opcode) {
+       case OPCODE_KIL:
+               code->inst[ip].inst1 |= R500_TEX_INST_TEXKILL;
+               break;
+       case OPCODE_TEX:
+               code->inst[ip].inst1 |= R500_TEX_INST_LD;
+               break;
+       case OPCODE_TXB:
+               code->inst[ip].inst1 |= R500_TEX_INST_LODBIAS;
+               break;
+       case OPCODE_TXP:
+               code->inst[ip].inst1 |= R500_TEX_INST_PROJ;
+               break;
+       default:
+               error("emit_tex can't handle opcode %x\n", inst->Opcode);
+       }
+
+       code->inst[ip].inst2 = R500_TEX_SRC_ADDR(inst->SrcReg[0].Index)
+               | (translate_strq_swizzle(inst->SrcReg[0]) << 8)
+               | R500_TEX_DST_ADDR(inst->DstReg.Index)
+               | R500_TEX_DST_R_SWIZ_R | R500_TEX_DST_G_SWIZ_G
+               | R500_TEX_DST_B_SWIZ_B | R500_TEX_DST_A_SWIZ_A;
+
+       return GL_TRUE;
+}
+
+static const struct radeon_pair_handler pair_handler = {
+       .EmitConst = emit_const,
+       .EmitPaired = emit_paired,
+       .EmitTex = emit_tex,
+       .MaxHwTemps = 128
+};
+
+GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler)
+{
+       struct r500_fragment_program_code *code = compiler->code;
+
+       _mesa_bzero(code, sizeof(*code));
+       code->max_temp_idx = 1;
+       code->inst_offset = 0;
+       code->inst_end = -1;
+
+       if (!radeonPairProgram(compiler->r300->radeon.glCtx, compiler->program, &pair_handler, compiler))
+               return GL_FALSE;
+
+       if ((code->inst[code->inst_end].inst0 & R500_INST_TYPE_MASK) != R500_INST_TYPE_OUT) {
+               /* This may happen when dead-code elimination is disabled or
+                * when most of the fragment program logic is leading to a KIL */
+               if (code->inst_end >= 511) {
+                       error("Introducing fake OUT: Too many instructions");
+                       return GL_FALSE;
+               }
+
+               int ip = ++code->inst_end;
+               code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT;
+       }
+
+       return GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/r600/radeon_context.h b/src/mesa/drivers/dri/r600/radeon_context.h
new file mode 100644 (file)
index 0000000..250570f
--- /dev/null
@@ -0,0 +1,76 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Gareth Hughes <gareth@valinux.com>
+ *   Keith Whitwell <keith@tungstengraphics.com>
+ *   Kevin E. Martin <martin@valinux.com>
+ *   Nicolai Haehnle <prefect_@gmx.net>
+ */
+
+#ifndef __RADEON_CONTEXT_H__
+#define __RADEON_CONTEXT_H__
+
+#include "main/mtypes.h"
+#include "main/colormac.h"
+#include "radeon_screen.h"
+#include "drm.h"
+#include "dri_util.h"
+
+#include "radeon_screen.h"
+
+#if R200_MERGED
+extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
+
+#define FALLBACK( radeon, bit, mode ) do {                     \
+   if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",       \
+                    __FUNCTION__, bit, mode );                 \
+   radeonFallback( (radeon)->glCtx, bit, mode );               \
+} while (0)
+#else
+#define FALLBACK( radeon, bit, mode ) fprintf(stderr, "%s:%s\n", __LINE__, __FILE__);
+#endif
+
+/* TCL fallbacks */
+extern void radeonTclFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
+
+#if R200_MERGED
+#define TCL_FALLBACK( ctx, bit, mode ) radeonTclFallback( ctx, bit, mode )
+#else
+#define TCL_FALLBACK( ctx, bit, mode ) ;
+#endif
+
+
+#endif                         /* __RADEON_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r600/radeon_nqssadce.c b/src/mesa/drivers/dri/r600/radeon_nqssadce.c
new file mode 100644 (file)
index 0000000..a083c3d
--- /dev/null
@@ -0,0 +1,284 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * @file
+ *
+ * "Not-quite SSA" and Dead-Code Elimination.
+ *
+ * @note This code uses SWIZZLE_NIL in a source register to indicate that
+ * the corresponding component is ignored by the corresponding instruction.
+ */
+
+#include "radeon_nqssadce.h"
+
+
+/**
+ * Return the @ref register_state for the given register (or 0 for untracked
+ * registers, i.e. constants).
+ */
+static struct register_state *get_reg_state(struct nqssadce_state* s, GLuint file, GLuint index)
+{
+       switch(file) {
+       case PROGRAM_TEMPORARY: return &s->Temps[index];
+       case PROGRAM_OUTPUT: return &s->Outputs[index];
+       default: return 0;
+       }
+}
+
+
+/**
+ * Left multiplication of a register with a swizzle
+ *
+ * @note Works correctly only for X, Y, Z, W swizzles, not for constant swizzles.
+ */
+static struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_register srcreg)
+{
+       struct prog_src_register tmp = srcreg;
+       int i;
+       tmp.Swizzle = 0;
+       tmp.NegateBase = 0;
+       for(i = 0; i < 4; ++i) {
+               GLuint swz = GET_SWZ(swizzle, i);
+               if (swz < 4) {
+                       tmp.Swizzle |= GET_SWZ(srcreg.Swizzle, swz) << (i*3);
+                       tmp.NegateBase |= GET_BIT(srcreg.NegateBase, swz) << i;
+               } else {
+                       tmp.Swizzle |= swz << (i*3);
+               }
+       }
+       return tmp;
+}
+
+
+static struct prog_instruction* track_used_srcreg(struct nqssadce_state* s,
+       struct prog_instruction *inst, GLint src, GLuint sourced)
+{
+       int i;
+       GLuint deswz_source = 0;
+
+       for(i = 0; i < 4; ++i) {
+               if (GET_BIT(sourced, i)) {
+                       GLuint swz = GET_SWZ(inst->SrcReg[src].Swizzle, i);
+                       deswz_source |= 1 << swz;
+               } else {
+                       inst->SrcReg[src].Swizzle &= ~(7 << (3*i));
+                       inst->SrcReg[src].Swizzle |= SWIZZLE_NIL << (3*i);
+               }
+       }
+
+       if (!s->Descr->IsNativeSwizzle(inst->Opcode, inst->SrcReg[src])) {
+               struct prog_dst_register dstreg = inst->DstReg;
+               dstreg.File = PROGRAM_TEMPORARY;
+               dstreg.Index = _mesa_find_free_register(s->Program, PROGRAM_TEMPORARY);
+               dstreg.WriteMask = sourced;
+
+               s->Descr->BuildSwizzle(s, dstreg, inst->SrcReg[src]);
+
+               inst = s->Program->Instructions + s->IP;
+               inst->SrcReg[src].File = PROGRAM_TEMPORARY;
+               inst->SrcReg[src].Index = dstreg.Index;
+               inst->SrcReg[src].Swizzle = 0;
+               inst->SrcReg[src].NegateBase = 0;
+               inst->SrcReg[src].Abs = 0;
+               inst->SrcReg[src].NegateAbs = 0;
+               for(i = 0; i < 4; ++i) {
+                       if (GET_BIT(sourced, i))
+                               inst->SrcReg[src].Swizzle |= i << (3*i);
+                       else
+                               inst->SrcReg[src].Swizzle |= SWIZZLE_NIL << (3*i);
+               }
+               deswz_source = sourced;
+       }
+
+       struct register_state *regstate = get_reg_state(s, inst->SrcReg[src].File, inst->SrcReg[src].Index);
+       if (regstate)
+               regstate->Sourced |= deswz_source & 0xf;
+
+       return inst;
+}
+
+
+static void rewrite_depth_out(struct prog_instruction *inst)
+{
+       if (inst->DstReg.WriteMask & WRITEMASK_Z) {
+               inst->DstReg.WriteMask = WRITEMASK_W;
+       } else {
+               inst->DstReg.WriteMask = 0;
+               return;
+       }
+
+       switch (inst->Opcode) {
+       case OPCODE_FRC:
+       case OPCODE_MOV:
+               inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
+               break;
+       case OPCODE_ADD:
+       case OPCODE_MAX:
+       case OPCODE_MIN:
+       case OPCODE_MUL:
+               inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
+               inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
+               break;
+       case OPCODE_CMP:
+       case OPCODE_MAD:
+               inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
+               inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
+               inst->SrcReg[2] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[2]);
+               break;
+       default:
+               // Scalar instructions needn't be reswizzled
+               break;
+       }
+}
+
+static void unalias_srcregs(struct prog_instruction *inst, GLuint oldindex, GLuint newindex)
+{
+       int nsrc = _mesa_num_inst_src_regs(inst->Opcode);
+       int i;
+       for(i = 0; i < nsrc; ++i)
+               if (inst->SrcReg[i].File == PROGRAM_TEMPORARY && inst->SrcReg[i].Index == oldindex)
+                       inst->SrcReg[i].Index = newindex;
+}
+
+static void unalias_temporary(struct nqssadce_state* s, GLuint oldindex)
+{
+       GLuint newindex = _mesa_find_free_register(s->Program, PROGRAM_TEMPORARY);
+       int ip;
+       for(ip = 0; ip < s->IP; ++ip) {
+               struct prog_instruction* inst = s->Program->Instructions + ip;
+               if (inst->DstReg.File == PROGRAM_TEMPORARY && inst->DstReg.Index == oldindex)
+                       inst->DstReg.Index = newindex;
+               unalias_srcregs(inst, oldindex, newindex);
+       }
+       unalias_srcregs(s->Program->Instructions + s->IP, oldindex, newindex);
+}
+
+
+/**
+ * Handle one instruction.
+ */
+static void process_instruction(struct nqssadce_state* s)
+{
+       struct prog_instruction *inst = s->Program->Instructions + s->IP;
+
+       if (inst->Opcode == OPCODE_END)
+               return;
+
+       if (inst->Opcode != OPCODE_KIL) {
+               if (s->Descr->RewriteDepthOut) {
+                       if (inst->DstReg.File == PROGRAM_OUTPUT && inst->DstReg.Index == FRAG_RESULT_DEPTH)
+                               rewrite_depth_out(inst);
+               }
+
+               struct register_state *regstate = get_reg_state(s, inst->DstReg.File, inst->DstReg.Index);
+               if (!regstate) {
+                       _mesa_problem(s->Ctx, "NqssaDce: bad destination register (%i[%i])\n",
+                               inst->DstReg.File, inst->DstReg.Index);
+                       return;
+               }
+
+               inst->DstReg.WriteMask &= regstate->Sourced;
+               regstate->Sourced &= ~inst->DstReg.WriteMask;
+
+               if (inst->DstReg.WriteMask == 0) {
+                       _mesa_delete_instructions(s->Program, s->IP, 1);
+                       return;
+               }
+
+               if (inst->DstReg.File == PROGRAM_TEMPORARY && !regstate->Sourced)
+                       unalias_temporary(s, inst->DstReg.Index);
+       }
+
+       /* Attention: Due to swizzle emulation code, the following
+        * might change the instruction stream under us, so we have
+        * to be careful with the inst pointer. */
+       switch (inst->Opcode) {
+       case OPCODE_DDX:
+       case OPCODE_DDY:
+       case OPCODE_FRC:
+       case OPCODE_MOV:
+               inst = track_used_srcreg(s, inst, 0, inst->DstReg.WriteMask);
+               break;
+       case OPCODE_ADD:
+       case OPCODE_MAX:
+       case OPCODE_MIN:
+       case OPCODE_MUL:
+               inst = track_used_srcreg(s, inst, 0, inst->DstReg.WriteMask);
+               inst = track_used_srcreg(s, inst, 1, inst->DstReg.WriteMask);
+               break;
+       case OPCODE_CMP:
+       case OPCODE_MAD:
+               inst = track_used_srcreg(s, inst, 0, inst->DstReg.WriteMask);
+               inst = track_used_srcreg(s, inst, 1, inst->DstReg.WriteMask);
+               inst = track_used_srcreg(s, inst, 2, inst->DstReg.WriteMask);
+               break;
+       case OPCODE_COS:
+       case OPCODE_EX2:
+       case OPCODE_LG2:
+       case OPCODE_RCP:
+       case OPCODE_RSQ:
+       case OPCODE_SIN:
+               inst = track_used_srcreg(s, inst, 0, 0x1);
+               break;
+       case OPCODE_DP3:
+               inst = track_used_srcreg(s, inst, 0, 0x7);
+               inst = track_used_srcreg(s, inst, 1, 0x7);
+               break;
+       case OPCODE_DP4:
+               inst = track_used_srcreg(s, inst, 0, 0xf);
+               inst = track_used_srcreg(s, inst, 1, 0xf);
+               break;
+       case OPCODE_KIL:
+       case OPCODE_TEX:
+       case OPCODE_TXB:
+       case OPCODE_TXP:
+               inst = track_used_srcreg(s, inst, 0, 0xf);
+               break;
+       default:
+               _mesa_problem(s->Ctx, "NqssaDce: Unknown opcode %d\n", inst->Opcode);
+               return;
+       }
+}
+
+
+void radeonNqssaDce(GLcontext *ctx, struct gl_program *p, struct radeon_nqssadce_descr* descr)
+{
+       struct nqssadce_state s;
+
+       _mesa_bzero(&s, sizeof(s));
+       s.Ctx = ctx;
+       s.Program = p;
+       s.Descr = descr;
+       s.Descr->Init(&s);
+       s.IP = p->NumInstructions;
+
+       while(s.IP > 0) {
+               s.IP--;
+               process_instruction(&s);
+       }
+}
diff --git a/src/mesa/drivers/dri/r600/radeon_nqssadce.h b/src/mesa/drivers/dri/r600/radeon_nqssadce.h
new file mode 100644 (file)
index 0000000..a4f94ab
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RADEON_PROGRAM_NQSSADCE_H_
+#define __RADEON_PROGRAM_NQSSADCE_H_
+
+#include "radeon_program.h"
+
+
+struct register_state {
+       /**
+        * Bitmask indicating which components of the register are sourced
+        * by later instructions.
+        */
+       GLuint Sourced : 4;
+};
+
+/**
+ * Maintain state such as which registers are used, which registers are
+ * read from, etc.
+ */
+struct nqssadce_state {
+       GLcontext *Ctx;
+       struct gl_program *Program;
+       struct radeon_nqssadce_descr *Descr;
+
+       /**
+        * All instructions after this instruction pointer have been dealt with.
+        */
+       int IP;
+
+       /**
+        * Which registers are read by subsequent instructions?
+        */
+       struct register_state Temps[MAX_PROGRAM_TEMPS];
+       struct register_state Outputs[VERT_RESULT_MAX];
+};
+
+
+/**
+ * This structure contains a description of the hardware in-so-far as
+ * it is required for the NqSSA-DCE pass.
+ */
+struct radeon_nqssadce_descr {
+       /**
+        * Fill in which outputs
+        */
+       void (*Init)(struct nqssadce_state *);
+
+       /**
+        * Check whether the given swizzle, absolute and negate combination
+        * can be implemented natively by the hardware for this opcode.
+        */
+       GLboolean (*IsNativeSwizzle)(GLuint opcode, struct prog_src_register reg);
+
+       /**
+        * Emit (at the current IP) the instruction MOV dst, src;
+        * The transformation will work recursively on the emitted instruction(s).
+        */
+       void (*BuildSwizzle)(struct nqssadce_state*, struct prog_dst_register dst, struct prog_src_register src);
+
+       /**
+        * Rewrite instructions that write to DEPR.z to write to DEPR.w
+        * instead (rewriting is done *before* the WriteMask test).
+        */
+       GLboolean RewriteDepthOut;
+       void *Data;
+};
+
+void radeonNqssaDce(GLcontext *ctx, struct gl_program *p, struct radeon_nqssadce_descr* descr);
+
+#endif /* __RADEON_PROGRAM_NQSSADCE_H_ */
diff --git a/src/mesa/drivers/dri/r600/radeon_program.c b/src/mesa/drivers/dri/r600/radeon_program.c
new file mode 100644 (file)
index 0000000..da5e7ae
--- /dev/null
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_program.h"
+
+#include "shader/prog_print.h"
+
+
+/**
+ * Transform the given clause in the following way:
+ *  1. Replace it with an empty clause
+ *  2. For every instruction in the original clause, try the given
+ *     transformations in order.
+ *  3. If one of the transformations returns GL_TRUE, assume that it
+ *     has emitted the appropriate instruction(s) into the new clause;
+ *     otherwise, copy the instruction verbatim.
+ *
+ * \note The transformation is currently not recursive; in other words,
+ * instructions emitted by transformations are not transformed.
+ *
+ * \note The transform is called 'local' because it can only look at
+ * one instruction at a time.
+ */
+void radeonLocalTransform(
+       GLcontext *Ctx,
+       struct gl_program *program,
+       int num_transformations,
+       struct radeon_program_transformation* transformations)
+{
+       struct radeon_transform_context ctx;
+       int ip;
+
+       ctx.Ctx = Ctx;
+       ctx.Program = program;
+       ctx.OldInstructions = program->Instructions;
+       ctx.OldNumInstructions = program->NumInstructions;
+
+       program->Instructions = 0;
+       program->NumInstructions = 0;
+
+       for(ip = 0; ip < ctx.OldNumInstructions; ++ip) {
+               struct prog_instruction *instr = ctx.OldInstructions + ip;
+               int i;
+
+               for(i = 0; i < num_transformations; ++i) {
+                       struct radeon_program_transformation* t = transformations + i;
+
+                       if (t->function(&ctx, instr, t->userData))
+                               break;
+               }
+
+               if (i >= num_transformations) {
+                       struct prog_instruction* dest = radeonAppendInstructions(program, 1);
+                       _mesa_copy_instructions(dest, instr, 1);
+               }
+       }
+
+       _mesa_free_instructions(ctx.OldInstructions, ctx.OldNumInstructions);
+}
+
+
+static void scan_instructions(GLboolean* used, const struct prog_instruction* insts, GLuint count)
+{
+       GLuint i;
+       for (i = 0; i < count; i++) {
+               const struct prog_instruction *inst = insts + i;
+               const GLuint n = _mesa_num_inst_src_regs(inst->Opcode);
+               GLuint k;
+
+               for (k = 0; k < n; k++) {
+                       if (inst->SrcReg[k].File == PROGRAM_TEMPORARY)
+                               used[inst->SrcReg[k].Index] = GL_TRUE;
+               }
+       }
+}
+
+GLint radeonFindFreeTemporary(struct radeon_transform_context *t)
+{
+       GLboolean used[MAX_PROGRAM_TEMPS];
+       GLuint i;
+
+       _mesa_memset(used, 0, sizeof(used));
+       scan_instructions(used, t->Program->Instructions, t->Program->NumInstructions);
+       scan_instructions(used, t->OldInstructions, t->OldNumInstructions);
+
+       for (i = 0; i < MAX_PROGRAM_TEMPS; i++) {
+               if (!used[i])
+                       return i;
+       }
+
+       return -1;
+}
+
+
+/**
+ * Append the given number of instructions to the program and return a
+ * pointer to the first new instruction.
+ */
+struct prog_instruction *radeonAppendInstructions(struct gl_program *program, int count)
+{
+       int oldnum = program->NumInstructions;
+       _mesa_insert_instructions(program, oldnum, count);
+       return program->Instructions + oldnum;
+}
diff --git a/src/mesa/drivers/dri/r600/radeon_program.h b/src/mesa/drivers/dri/r600/radeon_program.h
new file mode 100644 (file)
index 0000000..b411f69
--- /dev/null
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RADEON_PROGRAM_H_
+#define __RADEON_PROGRAM_H_
+
+#include "main/glheader.h"
+#include "main/macros.h"
+#include "main/enums.h"
+#include "shader/program.h"
+#include "shader/prog_instruction.h"
+
+
+enum {
+       CLAUSE_MIXED = 0,
+       CLAUSE_ALU,
+       CLAUSE_TEX
+};
+
+enum {
+       PROGRAM_BUILTIN = PROGRAM_FILE_MAX /**< not a real register, but a special swizzle constant */
+};
+
+enum {
+       OPCODE_REPL_ALPHA = MAX_OPCODE /**< used in paired instructions */
+};
+
+#define SWIZZLE_0000 MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO)
+#define SWIZZLE_1111 MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE)
+
+/**
+ * Transformation context that is passed to local transformations.
+ *
+ * Care must be taken with some operations during transformation,
+ * e.g. finding new temporary registers must use @ref radeonFindFreeTemporary
+ */
+struct radeon_transform_context {
+       GLcontext *Ctx;
+       struct gl_program *Program;
+       struct prog_instruction *OldInstructions;
+       GLuint OldNumInstructions;
+};
+
+/**
+ * A transformation that can be passed to \ref radeonLocalTransform.
+ *
+ * The function will be called once for each instruction.
+ * It has to either emit the appropriate transformed code for the instruction
+ * and return GL_TRUE, or return GL_FALSE if it doesn't understand the
+ * instruction.
+ *
+ * The function gets passed the userData as last parameter.
+ */
+struct radeon_program_transformation {
+       GLboolean (*function)(
+               struct radeon_transform_context*,
+               struct prog_instruction*,
+               void*);
+       void *userData;
+};
+
+void radeonLocalTransform(
+       GLcontext* ctx,
+       struct gl_program *program,
+       int num_transformations,
+       struct radeon_program_transformation* transformations);
+
+/**
+ * Find a usable free temporary register during program transformation
+ */
+GLint radeonFindFreeTemporary(struct radeon_transform_context *ctx);
+
+struct prog_instruction *radeonAppendInstructions(struct gl_program *program, int count);
+
+#endif
diff --git a/src/mesa/drivers/dri/r600/radeon_program_alu.c b/src/mesa/drivers/dri/r600/radeon_program_alu.c
new file mode 100644 (file)
index 0000000..1ef71e7
--- /dev/null
@@ -0,0 +1,658 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * @file
+ *
+ * Shareable transformations that transform "special" ALU instructions
+ * into ALU instructions that are supported by hardware.
+ *
+ */
+
+#include "radeon_program_alu.h"
+
+#include "shader/prog_parameter.h"
+
+
+static struct prog_instruction *emit1(struct gl_program* p,
+       gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
+       struct prog_src_register SrcReg)
+{
+       struct prog_instruction *fpi = radeonAppendInstructions(p, 1);
+
+       fpi->Opcode = Opcode;
+       fpi->SaturateMode = Saturate;
+       fpi->DstReg = DstReg;
+       fpi->SrcReg[0] = SrcReg;
+       return fpi;
+}
+
+static struct prog_instruction *emit2(struct gl_program* p,
+       gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
+       struct prog_src_register SrcReg0, struct prog_src_register SrcReg1)
+{
+       struct prog_instruction *fpi = radeonAppendInstructions(p, 1);
+
+       fpi->Opcode = Opcode;
+       fpi->SaturateMode = Saturate;
+       fpi->DstReg = DstReg;
+       fpi->SrcReg[0] = SrcReg0;
+       fpi->SrcReg[1] = SrcReg1;
+       return fpi;
+}
+
+static struct prog_instruction *emit3(struct gl_program* p,
+       gl_inst_opcode Opcode, GLuint Saturate, struct prog_dst_register DstReg,
+       struct prog_src_register SrcReg0, struct prog_src_register SrcReg1,
+       struct prog_src_register SrcReg2)
+{
+       struct prog_instruction *fpi = radeonAppendInstructions(p, 1);
+
+       fpi->Opcode = Opcode;
+       fpi->SaturateMode = Saturate;
+       fpi->DstReg = DstReg;
+       fpi->SrcReg[0] = SrcReg0;
+       fpi->SrcReg[1] = SrcReg1;
+       fpi->SrcReg[2] = SrcReg2;
+       return fpi;
+}
+
+static void set_swizzle(struct prog_src_register *SrcReg, int coordinate, int swz)
+{
+       SrcReg->Swizzle &= ~(7 << (3*coordinate));
+       SrcReg->Swizzle |= swz << (3*coordinate);
+}
+
+static void set_negate_base(struct prog_src_register *SrcReg, int coordinate, int negate)
+{
+       SrcReg->NegateBase &= ~(1 << coordinate);
+       SrcReg->NegateBase |= (negate << coordinate);
+}
+
+static struct prog_dst_register dstreg(int file, int index)
+{
+       struct prog_dst_register dst;
+       dst.File = file;
+       dst.Index = index;
+       dst.WriteMask = WRITEMASK_XYZW;
+       dst.CondMask = COND_TR;
+       dst.CondSwizzle = SWIZZLE_NOOP;
+       dst.CondSrc = 0;
+       dst.pad = 0;
+       return dst;
+}
+
+static struct prog_dst_register dstregtmpmask(int index, int mask)
+{
+       struct prog_dst_register dst;
+       dst.File = PROGRAM_TEMPORARY;
+       dst.Index = index;
+       dst.WriteMask = mask;
+       dst.CondMask = COND_TR;
+       dst.CondSwizzle = SWIZZLE_NOOP;
+       dst.CondSrc = 0;
+       dst.pad = 0;
+       return dst;
+}
+
+static const struct prog_src_register builtin_zero = {
+       .File = PROGRAM_BUILTIN,
+       .Index = 0,
+       .Swizzle = SWIZZLE_0000
+};
+static const struct prog_src_register builtin_one = {
+       .File = PROGRAM_BUILTIN,
+       .Index = 0,
+       .Swizzle = SWIZZLE_1111
+};
+static const struct prog_src_register srcreg_undefined = {
+       .File = PROGRAM_UNDEFINED,
+       .Index = 0,
+       .Swizzle = SWIZZLE_NOOP
+};
+
+static struct prog_src_register srcreg(int file, int index)
+{
+       struct prog_src_register src = srcreg_undefined;
+       src.File = file;
+       src.Index = index;
+       return src;
+}
+
+static struct prog_src_register srcregswz(int file, int index, int swz)
+{
+       struct prog_src_register src = srcreg_undefined;
+       src.File = file;
+       src.Index = index;
+       src.Swizzle = swz;
+       return src;
+}
+
+static struct prog_src_register absolute(struct prog_src_register reg)
+{
+       struct prog_src_register newreg = reg;
+       newreg.Abs = 1;
+       newreg.NegateBase = 0;
+       newreg.NegateAbs = 0;
+       return newreg;
+}
+
+static struct prog_src_register negate(struct prog_src_register reg)
+{
+       struct prog_src_register newreg = reg;
+       newreg.NegateAbs = !newreg.NegateAbs;
+       return newreg;
+}
+
+static struct prog_src_register swizzle(struct prog_src_register reg, GLuint x, GLuint y, GLuint z, GLuint w)
+{
+       struct prog_src_register swizzled = reg;
+       swizzled.Swizzle = MAKE_SWIZZLE4(
+               x >= 4 ? x : GET_SWZ(reg.Swizzle, x),
+               y >= 4 ? y : GET_SWZ(reg.Swizzle, y),
+               z >= 4 ? z : GET_SWZ(reg.Swizzle, z),
+               w >= 4 ? w : GET_SWZ(reg.Swizzle, w));
+       return swizzled;
+}
+
+static struct prog_src_register scalar(struct prog_src_register reg)
+{
+       return swizzle(reg, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
+}
+
+static void transform_ABS(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       struct prog_src_register src = inst->SrcReg[0];
+       src.Abs = 1;
+       src.NegateBase = 0;
+       src.NegateAbs = 0;
+       emit1(t->Program, OPCODE_MOV, inst->SaturateMode, inst->DstReg, src);
+}
+
+static void transform_DPH(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       struct prog_src_register src0 = inst->SrcReg[0];
+       if (src0.NegateAbs) {
+               if (src0.Abs) {
+                       int tempreg = radeonFindFreeTemporary(t);
+                       emit1(t->Program, OPCODE_MOV, 0, dstreg(PROGRAM_TEMPORARY, tempreg), src0);
+                       src0 = srcreg(src0.File, src0.Index);
+               } else {
+                       src0.NegateAbs = 0;
+                       src0.NegateBase ^= NEGATE_XYZW;
+               }
+       }
+       set_swizzle(&src0, 3, SWIZZLE_ONE);
+       set_negate_base(&src0, 3, 0);
+       emit2(t->Program, OPCODE_DP4, inst->SaturateMode, inst->DstReg, src0, inst->SrcReg[1]);
+}
+
+/**
+ * [1, src0.y*src1.y, src0.z, src1.w]
+ * So basically MUL with lotsa swizzling.
+ */
+static void transform_DST(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       emit2(t->Program, OPCODE_MUL, inst->SaturateMode, inst->DstReg,
+               swizzle(inst->SrcReg[0], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE),
+               swizzle(inst->SrcReg[1], SWIZZLE_ONE, SWIZZLE_Y, SWIZZLE_ONE, SWIZZLE_W));
+}
+
+static void transform_FLR(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       int tempreg = radeonFindFreeTemporary(t);
+       emit1(t->Program, OPCODE_FRC, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0]);
+       emit2(t->Program, OPCODE_ADD, inst->SaturateMode, inst->DstReg,
+               inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
+}
+
+/**
+ * Definition of LIT (from ARB_fragment_program):
+ *
+ *  tmp = VectorLoad(op0);
+ *  if (tmp.x < 0) tmp.x = 0;
+ *  if (tmp.y < 0) tmp.y = 0;
+ *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
+ *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
+ *  result.x = 1.0;
+ *  result.y = tmp.x;
+ *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
+ *  result.w = 1.0;
+ *
+ * The longest path of computation is the one leading to result.z,
+ * consisting of 5 operations. This implementation of LIT takes
+ * 5 slots, if the subsequent optimization passes are clever enough
+ * to pair instructions correctly.
+ */
+static void transform_LIT(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       static const GLfloat LitConst[4] = { -127.999999 };
+
+       GLuint constant;
+       GLuint constant_swizzle;
+       GLuint temp;
+       int needTemporary = 0;
+       struct prog_src_register srctemp;
+
+       constant = _mesa_add_unnamed_constant(t->Program->Parameters, LitConst, 1, &constant_swizzle);
+
+       if (inst->DstReg.WriteMask != WRITEMASK_XYZW) {
+               needTemporary = 1;
+       } else if (inst->DstReg.File != PROGRAM_TEMPORARY) {
+               // LIT is typically followed by DP3/DP4, so there's no point
+               // in creating special code for this case
+               needTemporary = 1;
+       }
+
+       if (needTemporary) {
+               temp = radeonFindFreeTemporary(t);
+       } else {
+               temp = inst->DstReg.Index;
+       }
+       srctemp = srcreg(PROGRAM_TEMPORARY, temp);
+
+       // tmp.x = max(0.0, Src.x);
+       // tmp.y = max(0.0, Src.y);
+       // tmp.w = clamp(Src.z, -128+eps, 128-eps);
+       emit2(t->Program, OPCODE_MAX, 0,
+               dstregtmpmask(temp, WRITEMASK_XYW),
+               inst->SrcReg[0],
+               swizzle(srcreg(PROGRAM_CONSTANT, constant),
+                       SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, constant_swizzle&3));
+       emit2(t->Program, OPCODE_MIN, 0,
+               dstregtmpmask(temp, WRITEMASK_Z),
+               swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+               negate(srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle)));
+
+       // tmp.w = Pow(tmp.y, tmp.w)
+       emit1(t->Program, OPCODE_LG2, 0,
+               dstregtmpmask(temp, WRITEMASK_W),
+               swizzle(srctemp, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y));
+       emit2(t->Program, OPCODE_MUL, 0,
+               dstregtmpmask(temp, WRITEMASK_W),
+               swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+               swizzle(srctemp, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z));
+       emit1(t->Program, OPCODE_EX2, 0,
+               dstregtmpmask(temp, WRITEMASK_W),
+               swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
+
+       // tmp.z = (tmp.x > 0) ? tmp.w : 0.0
+       emit3(t->Program, OPCODE_CMP, inst->SaturateMode,
+               dstregtmpmask(temp, WRITEMASK_Z),
+               negate(swizzle(srctemp, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
+               swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+               builtin_zero);
+
+       // tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0
+       emit1(t->Program, OPCODE_MOV, inst->SaturateMode,
+               dstregtmpmask(temp, WRITEMASK_XYW),
+               swizzle(srctemp, SWIZZLE_ONE, SWIZZLE_X, SWIZZLE_ONE, SWIZZLE_ONE));
+
+       if (needTemporary)
+               emit1(t->Program, OPCODE_MOV, 0, inst->DstReg, srctemp);
+}
+
+static void transform_LRP(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       int tempreg = radeonFindFreeTemporary(t);
+
+       emit2(t->Program, OPCODE_ADD, 0,
+               dstreg(PROGRAM_TEMPORARY, tempreg),
+               inst->SrcReg[1], negate(inst->SrcReg[2]));
+       emit3(t->Program, OPCODE_MAD, inst->SaturateMode,
+               inst->DstReg,
+               inst->SrcReg[0], srcreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[2]);
+}
+
+static void transform_POW(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       int tempreg = radeonFindFreeTemporary(t);
+       struct prog_dst_register tempdst = dstreg(PROGRAM_TEMPORARY, tempreg);
+       struct prog_src_register tempsrc = srcreg(PROGRAM_TEMPORARY, tempreg);
+       tempdst.WriteMask = WRITEMASK_W;
+       tempsrc.Swizzle = SWIZZLE_WWWW;
+
+       emit1(t->Program, OPCODE_LG2, 0, tempdst, scalar(inst->SrcReg[0]));
+       emit2(t->Program, OPCODE_MUL, 0, tempdst, tempsrc, scalar(inst->SrcReg[1]));
+       emit1(t->Program, OPCODE_EX2, inst->SaturateMode, inst->DstReg, tempsrc);
+}
+
+static void transform_RSQ(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       emit1(t->Program, OPCODE_RSQ, inst->SaturateMode, inst->DstReg, absolute(inst->SrcReg[0]));
+}
+
+static void transform_SGE(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       int tempreg = radeonFindFreeTemporary(t);
+
+       emit2(t->Program, OPCODE_ADD, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1]));
+       emit3(t->Program, OPCODE_CMP, inst->SaturateMode, inst->DstReg,
+               srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one);
+}
+
+static void transform_SLT(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       int tempreg = radeonFindFreeTemporary(t);
+
+       emit2(t->Program, OPCODE_ADD, 0, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1]));
+       emit3(t->Program, OPCODE_CMP, inst->SaturateMode, inst->DstReg,
+               srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero);
+}
+
+static void transform_SUB(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       emit2(t->Program, OPCODE_ADD, inst->SaturateMode, inst->DstReg, inst->SrcReg[0], negate(inst->SrcReg[1]));
+}
+
+static void transform_SWZ(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       emit1(t->Program, OPCODE_MOV, inst->SaturateMode, inst->DstReg, inst->SrcReg[0]);
+}
+
+static void transform_XPD(struct radeon_transform_context* t,
+       struct prog_instruction* inst)
+{
+       int tempreg = radeonFindFreeTemporary(t);
+
+       emit2(t->Program, OPCODE_MUL, 0, dstreg(PROGRAM_TEMPORARY, tempreg),
+               swizzle(inst->SrcReg[0], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
+               swizzle(inst->SrcReg[1], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W));
+       emit3(t->Program, OPCODE_MAD, inst->SaturateMode, inst->DstReg,
+               swizzle(inst->SrcReg[0], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W),
+               swizzle(inst->SrcReg[1], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
+               negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
+}
+
+
+/**
+ * Can be used as a transformation for @ref radeonClauseLocalTransform,
+ * no userData necessary.
+ *
+ * Eliminates the following ALU instructions:
+ *  ABS, DPH, DST, FLR, LIT, LRP, POW, SGE, SLT, SUB, SWZ, XPD
+ * using:
+ *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
+ *
+ * Transforms RSQ to Radeon's native RSQ by explicitly setting
+ * absolute value.
+ *
+ * @note should be applicable to R300 and R500 fragment programs.
+ */
+GLboolean radeonTransformALU(struct radeon_transform_context* t,
+       struct prog_instruction* inst,
+       void* unused)
+{
+       switch(inst->Opcode) {
+       case OPCODE_ABS: transform_ABS(t, inst); return GL_TRUE;
+       case OPCODE_DPH: transform_DPH(t, inst); return GL_TRUE;
+       case OPCODE_DST: transform_DST(t, inst); return GL_TRUE;
+       case OPCODE_FLR: transform_FLR(t, inst); return GL_TRUE;
+       case OPCODE_LIT: transform_LIT(t, inst); return GL_TRUE;
+       case OPCODE_LRP: transform_LRP(t, inst); return GL_TRUE;
+       case OPCODE_POW: transform_POW(t, inst); return GL_TRUE;
+       case OPCODE_RSQ: transform_RSQ(t, inst); return GL_TRUE;
+       case OPCODE_SGE: transform_SGE(t, inst); return GL_TRUE;
+       case OPCODE_SLT: transform_SLT(t, inst); return GL_TRUE;
+       case OPCODE_SUB: transform_SUB(t, inst); return GL_TRUE;
+       case OPCODE_SWZ: transform_SWZ(t, inst); return GL_TRUE;
+       case OPCODE_XPD: transform_XPD(t, inst); return GL_TRUE;
+       default:
+               return GL_FALSE;
+       }
+}
+
+
+static void sincos_constants(struct radeon_transform_context* t, GLuint *constants)
+{
+       static const GLfloat SinCosConsts[2][4] = {
+               {
+                       1.273239545,            // 4/PI
+                       -0.405284735,           // -4/(PI*PI)
+                       3.141592654,            // PI
+                       0.2225                  // weight
+               },
+               {
+                       0.75,
+                       0.5,
+                       0.159154943,            // 1/(2*PI)
+                       6.283185307             // 2*PI
+               }
+       };
+       int i;
+
+       for(i = 0; i < 2; ++i) {
+               GLuint swz;
+               constants[i] = _mesa_add_unnamed_constant(t->Program->Parameters, SinCosConsts[i], 4, &swz);
+               ASSERT(swz == SWIZZLE_NOOP);
+       }
+}
+
+/**
+ * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
+ *
+ * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
+ * MAD tmp.x, tmp.y, |src|, tmp.x
+ * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
+ * MAD dest, tmp.y, weight, tmp.x
+ */
+static void sin_approx(struct radeon_transform_context* t,
+       struct prog_dst_register dst, struct prog_src_register src, const GLuint* constants)
+{
+       GLuint tempreg = radeonFindFreeTemporary(t);
+
+       emit2(t->Program, OPCODE_MUL, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
+               swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+               srcreg(PROGRAM_CONSTANT, constants[0]));
+       emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_X),
+               swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
+               absolute(swizzle(src, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
+               swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X));
+       emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_Y),
+               swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+               absolute(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
+               negate(swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)));
+       emit3(t->Program, OPCODE_MAD, 0, dst,
+               swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
+               swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+               swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X));
+}
+
+/**
+ * Translate the trigonometric functions COS, SIN, and SCS
+ * using only the basic instructions
+ *  MOV, ADD, MUL, MAD, FRC
+ */
+GLboolean radeonTransformTrigSimple(struct radeon_transform_context* t,
+       struct prog_instruction* inst,
+       void* unused)
+{
+       if (inst->Opcode != OPCODE_COS &&
+           inst->Opcode != OPCODE_SIN &&
+           inst->Opcode != OPCODE_SCS)
+               return GL_FALSE;
+
+       GLuint constants[2];
+       GLuint tempreg = radeonFindFreeTemporary(t);
+
+       sincos_constants(t, constants);
+
+       if (inst->Opcode == OPCODE_COS) {
+               // MAD tmp.x, src, 1/(2*PI), 0.75
+               // FRC tmp.x, tmp.x
+               // MAD tmp.z, tmp.x, 2*PI, -PI
+               emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+                       swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+                       swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
+                       swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X));
+               emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+                       swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
+               emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+                       swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+                       swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+                       negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
+
+               sin_approx(t, inst->DstReg,
+                       swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+                       constants);
+       } else if (inst->Opcode == OPCODE_SIN) {
+               emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+                       swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+                       swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
+                       swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y));
+               emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+                       swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
+               emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_W),
+                       swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+                       swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+                       negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
+
+               sin_approx(t, inst->DstReg,
+                       swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+                       constants);
+       } else {
+               emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
+                       swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+                       swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z),
+                       swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W));
+               emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
+                       srcreg(PROGRAM_TEMPORARY, tempreg));
+               emit3(t->Program, OPCODE_MAD, 0, dstregtmpmask(tempreg, WRITEMASK_XY),
+                       srcreg(PROGRAM_TEMPORARY, tempreg),
+                       swizzle(srcreg(PROGRAM_CONSTANT, constants[1]), SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
+                       negate(swizzle(srcreg(PROGRAM_CONSTANT, constants[0]), SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z)));
+
+               struct prog_dst_register dst = inst->DstReg;
+
+               dst.WriteMask = inst->DstReg.WriteMask & WRITEMASK_X;
+               sin_approx(t, dst,
+                       swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+                       constants);
+
+               dst.WriteMask = inst->DstReg.WriteMask & WRITEMASK_Y;
+               sin_approx(t, dst,
+                       swizzle(srcreg(PROGRAM_TEMPORARY, tempreg), SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
+                       constants);
+       }
+
+       return GL_TRUE;
+}
+
+
+/**
+ * Transform the trigonometric functions COS, SIN, and SCS
+ * to include pre-scaling by 1/(2*PI) and taking the fractional
+ * part, so that the input to COS and SIN is always in the range [0,1).
+ * SCS is replaced by one COS and one SIN instruction.
+ *
+ * @warning This transformation implicitly changes the semantics of SIN and COS!
+ */
+GLboolean radeonTransformTrigScale(struct radeon_transform_context* t,
+       struct prog_instruction* inst,
+       void* unused)
+{
+       if (inst->Opcode != OPCODE_COS &&
+           inst->Opcode != OPCODE_SIN &&
+           inst->Opcode != OPCODE_SCS)
+               return GL_FALSE;
+
+       static const GLfloat RCP_2PI[] = { 0.15915494309189535 };
+       GLuint temp;
+       GLuint constant;
+       GLuint constant_swizzle;
+
+       temp = radeonFindFreeTemporary(t);
+       constant = _mesa_add_unnamed_constant(t->Program->Parameters, RCP_2PI, 1, &constant_swizzle);
+
+       emit2(t->Program, OPCODE_MUL, 0, dstregtmpmask(temp, WRITEMASK_W),
+               swizzle(inst->SrcReg[0], SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
+               srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle));
+       emit1(t->Program, OPCODE_FRC, 0, dstregtmpmask(temp, WRITEMASK_W),
+               srcreg(PROGRAM_TEMPORARY, temp));
+
+       if (inst->Opcode == OPCODE_COS) {
+               emit1(t->Program, OPCODE_COS, inst->SaturateMode, inst->DstReg,
+                       srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
+       } else if (inst->Opcode == OPCODE_SIN) {
+               emit1(t->Program, OPCODE_SIN, inst->SaturateMode,
+                       inst->DstReg, srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
+       } else if (inst->Opcode == OPCODE_SCS) {
+               struct prog_dst_register moddst = inst->DstReg;
+
+               if (inst->DstReg.WriteMask & WRITEMASK_X) {
+                       moddst.WriteMask = WRITEMASK_X;
+                       emit1(t->Program, OPCODE_COS, inst->SaturateMode, moddst,
+                               srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
+               }
+               if (inst->DstReg.WriteMask & WRITEMASK_Y) {
+                       moddst.WriteMask = WRITEMASK_Y;
+                       emit1(t->Program, OPCODE_SIN, inst->SaturateMode, moddst,
+                               srcregswz(PROGRAM_TEMPORARY, temp, SWIZZLE_WWWW));
+               }
+       }
+
+       return GL_TRUE;
+}
+
+/**
+ * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
+ * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
+ * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
+ *
+ * @warning This explicitly changes the form of DDX and DDY!
+ */
+
+GLboolean radeonTransformDeriv(struct radeon_transform_context* t,
+       struct prog_instruction* inst,
+       void* unused)
+{
+       if (inst->Opcode != OPCODE_DDX && inst->Opcode != OPCODE_DDY)
+               return GL_FALSE;
+
+       struct prog_src_register B = inst->SrcReg[1];
+
+       B.Swizzle = MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE,
+                                               SWIZZLE_ONE, SWIZZLE_ONE);
+       B.NegateBase = NEGATE_XYZW;
+
+       emit2(t->Program, inst->Opcode, inst->SaturateMode, inst->DstReg,
+               inst->SrcReg[0], B);
+
+       return GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/r600/radeon_program_alu.h b/src/mesa/drivers/dri/r600/radeon_program_alu.h
new file mode 100644 (file)
index 0000000..b459581
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RADEON_PROGRAM_ALU_H_
+#define __RADEON_PROGRAM_ALU_H_
+
+#include "radeon_program.h"
+
+GLboolean radeonTransformALU(
+       struct radeon_transform_context *t,
+       struct prog_instruction*,
+       void*);
+
+GLboolean radeonTransformTrigSimple(
+       struct radeon_transform_context *t,
+       struct prog_instruction*,
+       void*);
+
+GLboolean radeonTransformTrigScale(
+       struct radeon_transform_context *t,
+       struct prog_instruction*,
+       void*);
+
+GLboolean radeonTransformDeriv(
+       struct radeon_transform_context *t,
+       struct prog_instruction*,
+       void*);
+
+#endif /* __RADEON_PROGRAM_ALU_H_ */
diff --git a/src/mesa/drivers/dri/r600/radeon_program_pair.c b/src/mesa/drivers/dri/r600/radeon_program_pair.c
new file mode 100644 (file)
index 0000000..49aa90d
--- /dev/null
@@ -0,0 +1,1006 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * @file
+ *
+ * Perform temporary register allocation and attempt to pair off instructions
+ * in RGB and Alpha pairs. Also attempts to optimize the TEX instruction
+ * vs. ALU instruction scheduling.
+ */
+
+#include "radeon_program_pair.h"
+
+#include "radeon_common.h"
+
+#include "shader/prog_print.h"
+
+#define error(fmt, args...) do { \
+       _mesa_problem(s->Ctx, "%s::%s(): " fmt "\n",    \
+               __FILE__, __FUNCTION__, ##args);        \
+       s->Error = GL_TRUE;                             \
+} while(0)
+
+struct pair_state_instruction {
+       GLuint IsTex:1; /**< Is a texture instruction */
+       GLuint NeedRGB:1; /**< Needs the RGB ALU */
+       GLuint NeedAlpha:1; /**< Needs the Alpha ALU */
+       GLuint IsTranscendent:1; /**< Is a special transcendent instruction */
+
+       /**
+        * Number of (read and write) dependencies that must be resolved before
+        * this instruction can be scheduled.
+        */
+       GLuint NumDependencies:5;
+
+       /**
+        * Next instruction in the linked list of ready instructions.
+        */
+       struct pair_state_instruction *NextReady;
+
+       /**
+        * Values that this instruction writes
+        */
+       struct reg_value *Values[4];
+};
+
+
+/**
+ * Used to keep track of which instructions read a value.
+ */
+struct reg_value_reader {
+       GLuint IP; /**< IP of the instruction that performs this access */
+       struct reg_value_reader *Next;
+};
+
+/**
+ * Used to keep track which values are stored in each component of a
+ * PROGRAM_TEMPORARY.
+ */
+struct reg_value {
+       GLuint IP; /**< IP of the instruction that writes this value */
+       struct reg_value *Next; /**< Pointer to the next value to be written to the same PROGRAM_TEMPORARY component */
+
+       /**
+        * Unordered linked list of instructions that read from this value.
+        */
+       struct reg_value_reader *Readers;
+
+       /**
+        * Number of readers of this value. This is calculated during @ref scan_instructions
+        * and continually decremented during code emission.
+        * When this count reaches zero, the instruction that writes the @ref Next value
+        * can be scheduled.
+        */
+       GLuint NumReaders;
+};
+
+/**
+ * Used to translate a PROGRAM_INPUT or PROGRAM_TEMPORARY Mesa register
+ * to the proper hardware temporary.
+ */
+struct pair_register_translation {
+       GLuint Allocated:1;
+       GLuint HwIndex:8;
+       GLuint RefCount:23; /**< # of times this occurs in an unscheduled instruction SrcReg or DstReg */
+
+       /**
+        * Notes the value that is currently contained in each component
+        * (only used for PROGRAM_TEMPORARY registers).
+        */
+       struct reg_value *Value[4];
+};
+
+struct pair_state {
+       GLcontext *Ctx;
+       struct gl_program *Program;
+       const struct radeon_pair_handler *Handler;
+       GLboolean Error;
+       GLboolean Debug;
+       GLboolean Verbose;
+       void *UserData;
+
+       /**
+        * Translate Mesa registers to hardware registers
+        */
+       struct pair_register_translation Inputs[FRAG_ATTRIB_MAX];
+       struct pair_register_translation Temps[MAX_PROGRAM_TEMPS];
+
+       /**
+        * Derived information about program instructions.
+        */
+       struct pair_state_instruction *Instructions;
+
+       struct {
+               GLuint RefCount; /**< # of times this occurs in an unscheduled SrcReg or DstReg */
+       } HwTemps[128];
+
+       /**
+        * Linked list of instructions that can be scheduled right now,
+        * based on which ALU/TEX resources they require.
+        */
+       struct pair_state_instruction *ReadyFullALU;
+       struct pair_state_instruction *ReadyRGB;
+       struct pair_state_instruction *ReadyAlpha;
+       struct pair_state_instruction *ReadyTEX;
+
+       /**
+        * Pool of @ref reg_value structures for fast allocation.
+        */
+       struct reg_value *ValuePool;
+       GLuint ValuePoolUsed;
+       struct reg_value_reader *ReaderPool;
+       GLuint ReaderPoolUsed;
+};
+
+
+static struct pair_register_translation *get_register(struct pair_state *s, GLuint file, GLuint index)
+{
+       switch(file) {
+       case PROGRAM_TEMPORARY: return &s->Temps[index];
+       case PROGRAM_INPUT: return &s->Inputs[index];
+       default: return 0;
+       }
+}
+
+static void alloc_hw_reg(struct pair_state *s, GLuint file, GLuint index, GLuint hwindex)
+{
+       struct pair_register_translation *t = get_register(s, file, index);
+       ASSERT(!s->HwTemps[hwindex].RefCount);
+       ASSERT(!t->Allocated);
+       s->HwTemps[hwindex].RefCount = t->RefCount;
+       t->Allocated = 1;
+       t->HwIndex = hwindex;
+}
+
+static GLuint get_hw_reg(struct pair_state *s, GLuint file, GLuint index)
+{
+       GLuint hwindex;
+
+       struct pair_register_translation *t = get_register(s, file, index);
+       if (!t) {
+               _mesa_problem(s->Ctx, "get_hw_reg: %i[%i]\n", file, index);
+               return 0;
+       }
+
+       if (t->Allocated)
+               return t->HwIndex;
+
+       for(hwindex = 0; hwindex < s->Handler->MaxHwTemps; ++hwindex)
+               if (!s->HwTemps[hwindex].RefCount)
+                       break;
+
+       if (hwindex >= s->Handler->MaxHwTemps) {
+               error("Ran out of hardware temporaries");
+               return 0;
+       }
+
+       alloc_hw_reg(s, file, index, hwindex);
+       return hwindex;
+}
+
+
+static void deref_hw_reg(struct pair_state *s, GLuint hwindex)
+{
+       if (!s->HwTemps[hwindex].RefCount) {
+               error("Hwindex %i refcount error", hwindex);
+               return;
+       }
+
+       s->HwTemps[hwindex].RefCount--;
+}
+
+static void add_pairinst_to_list(struct pair_state_instruction **list, struct pair_state_instruction *pairinst)
+{
+       pairinst->NextReady = *list;
+       *list = pairinst;
+}
+
+/**
+ * The instruction at the given IP has become ready. Link it into the ready
+ * instructions.
+ */
+static void instruction_ready(struct pair_state *s, int ip)
+{
+       struct pair_state_instruction *pairinst = s->Instructions + ip;
+
+       if (s->Verbose)
+               _mesa_printf("instruction_ready(%i)\n", ip);
+
+       if (pairinst->IsTex)
+               add_pairinst_to_list(&s->ReadyTEX, pairinst);
+       else if (!pairinst->NeedAlpha)
+               add_pairinst_to_list(&s->ReadyRGB, pairinst);
+       else if (!pairinst->NeedRGB)
+               add_pairinst_to_list(&s->ReadyAlpha, pairinst);
+       else
+               add_pairinst_to_list(&s->ReadyFullALU, pairinst);
+}
+
+
+/**
+ * Finally rewrite ADD, MOV, MUL as the appropriate native instruction
+ * and reverse the order of arguments for CMP.
+ */
+static void final_rewrite(struct pair_state *s, struct prog_instruction *inst)
+{
+       struct prog_src_register tmp;
+
+       switch(inst->Opcode) {
+       case OPCODE_ADD:
+               inst->SrcReg[2] = inst->SrcReg[1];
+               inst->SrcReg[1].File = PROGRAM_BUILTIN;
+               inst->SrcReg[1].Swizzle = SWIZZLE_1111;
+               inst->SrcReg[1].NegateBase = 0;
+               inst->SrcReg[1].NegateAbs = 0;
+               inst->Opcode = OPCODE_MAD;
+               break;
+       case OPCODE_CMP:
+               tmp = inst->SrcReg[2];
+               inst->SrcReg[2] = inst->SrcReg[0];
+               inst->SrcReg[0] = tmp;
+               break;
+       case OPCODE_MOV:
+               /* AMD say we should use CMP.
+                * However, when we transform
+                *  KIL -r0;
+                * into
+                *  CMP tmp, -r0, -r0, 0;
+                *  KIL tmp;
+                * we get incorrect behaviour on R500 when r0 == 0.0.
+                * It appears that the R500 KIL hardware treats -0.0 as less
+                * than zero.
+                */
+               inst->SrcReg[1].File = PROGRAM_BUILTIN;
+               inst->SrcReg[1].Swizzle = SWIZZLE_1111;
+               inst->SrcReg[2].File = PROGRAM_BUILTIN;
+               inst->SrcReg[2].Swizzle = SWIZZLE_0000;
+               inst->Opcode = OPCODE_MAD;
+               break;
+       case OPCODE_MUL:
+               inst->SrcReg[2].File = PROGRAM_BUILTIN;
+               inst->SrcReg[2].Swizzle = SWIZZLE_0000;
+               inst->Opcode = OPCODE_MAD;
+               break;
+       default:
+               /* nothing to do */
+               break;
+       }
+}
+
+
+/**
+ * Classify an instruction according to which ALUs etc. it needs
+ */
+static void classify_instruction(struct pair_state *s,
+       struct prog_instruction *inst, struct pair_state_instruction *pairinst)
+{
+       pairinst->NeedRGB = (inst->DstReg.WriteMask & WRITEMASK_XYZ) ? 1 : 0;
+       pairinst->NeedAlpha = (inst->DstReg.WriteMask & WRITEMASK_W) ? 1 : 0;
+
+       switch(inst->Opcode) {
+       case OPCODE_ADD:
+       case OPCODE_CMP:
+       case OPCODE_DDX:
+       case OPCODE_DDY:
+       case OPCODE_FRC:
+       case OPCODE_MAD:
+       case OPCODE_MAX:
+       case OPCODE_MIN:
+       case OPCODE_MOV:
+       case OPCODE_MUL:
+               break;
+       case OPCODE_COS:
+       case OPCODE_EX2:
+       case OPCODE_LG2:
+       case OPCODE_RCP:
+       case OPCODE_RSQ:
+       case OPCODE_SIN:
+               pairinst->IsTranscendent = 1;
+               pairinst->NeedAlpha = 1;
+               break;
+       case OPCODE_DP4:
+               pairinst->NeedAlpha = 1;
+               /* fall through */
+       case OPCODE_DP3:
+               pairinst->NeedRGB = 1;
+               break;
+       case OPCODE_KIL:
+       case OPCODE_TEX:
+       case OPCODE_TXB:
+       case OPCODE_TXP:
+       case OPCODE_END:
+               pairinst->IsTex = 1;
+               break;
+       default:
+               error("Unknown opcode %d\n", inst->Opcode);
+               break;
+       }
+}
+
+
+/**
+ * Count which (input, temporary) register is read and written how often,
+ * and scan the instruction stream to find dependencies.
+ */
+static void scan_instructions(struct pair_state *s)
+{
+       struct prog_instruction *inst;
+       struct pair_state_instruction *pairinst;
+       GLuint ip;
+
+       for(inst = s->Program->Instructions, pairinst = s->Instructions, ip = 0;
+           inst->Opcode != OPCODE_END;
+           ++inst, ++pairinst, ++ip) {
+               final_rewrite(s, inst);
+               classify_instruction(s, inst, pairinst);
+
+               int nsrc = _mesa_num_inst_src_regs(inst->Opcode);
+               int j;
+               for(j = 0; j < nsrc; j++) {
+                       struct pair_register_translation *t =
+                               get_register(s, inst->SrcReg[j].File, inst->SrcReg[j].Index);
+                       if (!t)
+                               continue;
+
+                       t->RefCount++;
+
+                       if (inst->SrcReg[j].File == PROGRAM_TEMPORARY) {
+                               int i;
+                               for(i = 0; i < 4; ++i) {
+                                       GLuint swz = GET_SWZ(inst->SrcReg[j].Swizzle, i);
+                                       if (swz >= 4)
+                                               continue; /* constant or NIL swizzle */
+                                       if (!t->Value[swz])
+                                               continue; /* this is an undefined read */
+
+                                       /* Do not add a dependency if this instruction
+                                        * also rewrites the value. The code below adds
+                                        * a dependency for the DstReg, which is a superset
+                                        * of the SrcReg dependency. */
+                                       if (inst->DstReg.File == PROGRAM_TEMPORARY &&
+                                           inst->DstReg.Index == inst->SrcReg[j].Index &&
+                                           GET_BIT(inst->DstReg.WriteMask, swz))
+                                               continue;
+
+                                       struct reg_value_reader* r = &s->ReaderPool[s->ReaderPoolUsed++];
+                                       pairinst->NumDependencies++;
+                                       t->Value[swz]->NumReaders++;
+                                       r->IP = ip;
+                                       r->Next = t->Value[swz]->Readers;
+                                       t->Value[swz]->Readers = r;
+                               }
+                       }
+               }
+
+               int ndst = _mesa_num_inst_dst_regs(inst->Opcode);
+               if (ndst) {
+                       struct pair_register_translation *t =
+                               get_register(s, inst->DstReg.File, inst->DstReg.Index);
+                       if (t) {
+                               t->RefCount++;
+
+                               if (inst->DstReg.File == PROGRAM_TEMPORARY) {
+                                       int j;
+                                       for(j = 0; j < 4; ++j) {
+                                               if (!GET_BIT(inst->DstReg.WriteMask, j))
+                                                       continue;
+
+                                               struct reg_value* v = &s->ValuePool[s->ValuePoolUsed++];
+                                               v->IP = ip;
+                                               if (t->Value[j]) {
+                                                       pairinst->NumDependencies++;
+                                                       t->Value[j]->Next = v;
+                                               }
+                                               t->Value[j] = v;
+                                               pairinst->Values[j] = v;
+                                       }
+                               }
+                       }
+               }
+
+               if (s->Verbose)
+                       _mesa_printf("scan(%i): NumDeps = %i\n", ip, pairinst->NumDependencies);
+
+               if (!pairinst->NumDependencies)
+                       instruction_ready(s, ip);
+       }
+
+       /* Clear the PROGRAM_TEMPORARY state */
+       int i, j;
+       for(i = 0; i < MAX_PROGRAM_TEMPS; ++i) {
+               for(j = 0; j < 4; ++j)
+                       s->Temps[i].Value[j] = 0;
+       }
+}
+
+
+/**
+ * Reserve hardware temporary registers for the program inputs.
+ *
+ * @note This allocation is performed explicitly, because the order of inputs
+ * is determined by the RS hardware.
+ */
+static void allocate_input_registers(struct pair_state *s)
+{
+       GLuint InputsRead = s->Program->InputsRead;
+       int i;
+       GLuint hwindex = 0;
+
+       /* Primary colour */
+       if (InputsRead & FRAG_BIT_COL0)
+               alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL0, hwindex++);
+       InputsRead &= ~FRAG_BIT_COL0;
+
+       /* Secondary color */
+       if (InputsRead & FRAG_BIT_COL1)
+               alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_COL1, hwindex++);
+       InputsRead &= ~FRAG_BIT_COL1;
+
+       /* Texcoords */
+       for (i = 0; i < s->Ctx->Const.MaxTextureUnits; i++) {
+               if (InputsRead & (FRAG_BIT_TEX0 << i))
+                       alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_TEX0+i, hwindex++);
+       }
+       InputsRead &= ~FRAG_BITS_TEX_ANY;
+
+       /* Fogcoords treated as a texcoord */
+       if (InputsRead & FRAG_BIT_FOGC)
+               alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_FOGC, hwindex++);
+       InputsRead &= ~FRAG_BIT_FOGC;
+
+       /* fragment position treated as a texcoord */
+       if (InputsRead & FRAG_BIT_WPOS)
+               alloc_hw_reg(s, PROGRAM_INPUT, FRAG_ATTRIB_WPOS, hwindex++);
+       InputsRead &= ~FRAG_BIT_WPOS;
+
+       /* Anything else */
+       if (InputsRead)
+               error("Don't know how to handle inputs 0x%x\n", InputsRead);
+}
+
+
+static void decrement_dependencies(struct pair_state *s, int ip)
+{
+       struct pair_state_instruction *pairinst = s->Instructions + ip;
+       ASSERT(pairinst->NumDependencies > 0);
+       if (!--pairinst->NumDependencies)
+               instruction_ready(s, ip);
+}
+
+/**
+ * Update the dependency tracking state based on what the instruction
+ * at the given IP does.
+ */
+static void commit_instruction(struct pair_state *s, int ip)
+{
+       struct prog_instruction *inst = s->Program->Instructions + ip;
+       struct pair_state_instruction *pairinst = s->Instructions + ip;
+
+       if (s->Verbose)
+               _mesa_printf("commit_instruction(%i)\n", ip);
+
+       if (inst->DstReg.File == PROGRAM_TEMPORARY) {
+               struct pair_register_translation *t = &s->Temps[inst->DstReg.Index];
+               deref_hw_reg(s, t->HwIndex);
+
+               int i;
+               for(i = 0; i < 4; ++i) {
+                       if (!GET_BIT(inst->DstReg.WriteMask, i))
+                               continue;
+
+                       t->Value[i] = pairinst->Values[i];
+                       if (t->Value[i]->NumReaders) {
+                               struct reg_value_reader *r;
+                               for(r = pairinst->Values[i]->Readers; r; r = r->Next)
+                                       decrement_dependencies(s, r->IP);
+                       } else if (t->Value[i]->Next) {
+                               /* This happens when the only reader writes
+                                * the register at the same time */
+                               decrement_dependencies(s, t->Value[i]->Next->IP);
+                       }
+               }
+       }
+
+       int nsrc = _mesa_num_inst_src_regs(inst->Opcode);
+       int i;
+       for(i = 0; i < nsrc; i++) {
+               struct pair_register_translation *t = get_register(s, inst->SrcReg[i].File, inst->SrcReg[i].Index);
+               if (!t)
+                       continue;
+
+               deref_hw_reg(s, get_hw_reg(s, inst->SrcReg[i].File, inst->SrcReg[i].Index));
+
+               if (inst->SrcReg[i].File != PROGRAM_TEMPORARY)
+                       continue;
+
+               int j;
+               for(j = 0; j < 4; ++j) {
+                       GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j);
+                       if (swz >= 4)
+                               continue;
+                       if (!t->Value[swz])
+                               continue;
+
+                       /* Do not free a dependency if this instruction
+                        * also rewrites the value. See scan_instructions. */
+                       if (inst->DstReg.File == PROGRAM_TEMPORARY &&
+                           inst->DstReg.Index == inst->SrcReg[i].Index &&
+                           GET_BIT(inst->DstReg.WriteMask, swz))
+                               continue;
+
+                       if (!--t->Value[swz]->NumReaders) {
+                               if (t->Value[swz]->Next)
+                                       decrement_dependencies(s, t->Value[swz]->Next->IP);
+                       }
+               }
+       }
+}
+
+
+/**
+ * Emit all ready texture instructions in a single block.
+ *
+ * Emit as a single block to (hopefully) sample many textures in parallel,
+ * and to avoid hardware indirections on R300.
+ *
+ * In R500, we don't really know when the result of a texture instruction
+ * arrives. So allocate all destinations first, to make sure they do not
+ * arrive early and overwrite a texture coordinate we're going to use later
+ * in the block.
+ */
+static void emit_all_tex(struct pair_state *s)
+{
+       struct pair_state_instruction *readytex;
+       struct pair_state_instruction *pairinst;
+
+       ASSERT(s->ReadyTEX);
+
+       // Don't let the ready list change under us!
+       readytex = s->ReadyTEX;
+       s->ReadyTEX = 0;
+
+       // Allocate destination hardware registers in one block to avoid conflicts.
+       for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) {
+               int ip = pairinst - s->Instructions;
+               struct prog_instruction *inst = s->Program->Instructions + ip;
+               if (inst->Opcode != OPCODE_KIL)
+                       get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
+       }
+
+       if (s->Debug)
+               _mesa_printf(" BEGIN_TEX\n");
+
+       if (s->Handler->BeginTexBlock)
+               s->Error = s->Error || !s->Handler->BeginTexBlock(s->UserData);
+
+       for(pairinst = readytex; pairinst; pairinst = pairinst->NextReady) {
+               int ip = pairinst - s->Instructions;
+               struct prog_instruction *inst = s->Program->Instructions + ip;
+               commit_instruction(s, ip);
+
+               if (inst->Opcode != OPCODE_KIL)
+                       inst->DstReg.Index = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
+               inst->SrcReg[0].Index = get_hw_reg(s, inst->SrcReg[0].File, inst->SrcReg[0].Index);
+
+               if (s->Debug) {
+                       _mesa_printf("   ");
+                       _mesa_print_instruction(inst);
+               }
+               s->Error = s->Error || !s->Handler->EmitTex(s->UserData, inst);
+       }
+
+       if (s->Debug)
+               _mesa_printf(" END_TEX\n");
+}
+
+
+static int alloc_pair_source(struct pair_state *s, struct radeon_pair_instruction *pair,
+       struct prog_src_register src, GLboolean rgb, GLboolean alpha)
+{
+       int candidate = -1;
+       int candidate_quality = -1;
+       int i;
+
+       if (!rgb && !alpha)
+               return 0;
+
+       GLuint constant;
+       GLuint index;
+
+       if (src.File == PROGRAM_TEMPORARY || src.File == PROGRAM_INPUT) {
+               constant = 0;
+               index = get_hw_reg(s, src.File, src.Index);
+       } else {
+               constant = 1;
+               s->Error |= !s->Handler->EmitConst(s->UserData, src.File, src.Index, &index);
+       }
+
+       for(i = 0; i < 3; ++i) {
+               int q = 0;
+               if (rgb) {
+                       if (pair->RGB.Src[i].Used) {
+                               if (pair->RGB.Src[i].Constant != constant ||
+                                   pair->RGB.Src[i].Index != index)
+                                       continue;
+                               q++;
+                       }
+               }
+               if (alpha) {
+                       if (pair->Alpha.Src[i].Used) {
+                               if (pair->Alpha.Src[i].Constant != constant ||
+                                   pair->Alpha.Src[i].Index != index)
+                                       continue;
+                               q++;
+                       }
+               }
+               if (q > candidate_quality) {
+                       candidate_quality = q;
+                       candidate = i;
+               }
+       }
+
+       if (candidate >= 0) {
+               if (rgb) {
+                       pair->RGB.Src[candidate].Used = 1;
+                       pair->RGB.Src[candidate].Constant = constant;
+                       pair->RGB.Src[candidate].Index = index;
+               }
+               if (alpha) {
+                       pair->Alpha.Src[candidate].Used = 1;
+                       pair->Alpha.Src[candidate].Constant = constant;
+                       pair->Alpha.Src[candidate].Index = index;
+               }
+       }
+
+       return candidate;
+}
+
+/**
+ * Fill the given ALU instruction's opcodes and source operands into the given pair,
+ * if possible.
+ */
+static GLboolean fill_instruction_into_pair(struct pair_state *s, struct radeon_pair_instruction *pair, int ip)
+{
+       struct pair_state_instruction *pairinst = s->Instructions + ip;
+       struct prog_instruction *inst = s->Program->Instructions + ip;
+
+       ASSERT(!pairinst->NeedRGB || pair->RGB.Opcode == OPCODE_NOP);
+       ASSERT(!pairinst->NeedAlpha || pair->Alpha.Opcode == OPCODE_NOP);
+
+       if (pairinst->NeedRGB) {
+               if (pairinst->IsTranscendent)
+                       pair->RGB.Opcode = OPCODE_REPL_ALPHA;
+               else
+                       pair->RGB.Opcode = inst->Opcode;
+               if (inst->SaturateMode == SATURATE_ZERO_ONE)
+                       pair->RGB.Saturate = 1;
+       }
+       if (pairinst->NeedAlpha) {
+               pair->Alpha.Opcode = inst->Opcode;
+               if (inst->SaturateMode == SATURATE_ZERO_ONE)
+                       pair->Alpha.Saturate = 1;
+       }
+
+       int nargs = _mesa_num_inst_src_regs(inst->Opcode);
+       int i;
+
+       /* Special case for DDX/DDY (MDH/MDV). */
+       if (inst->Opcode == OPCODE_DDX || inst->Opcode == OPCODE_DDY) {
+               if (pair->RGB.Src[0].Used || pair->Alpha.Src[0].Used)
+                       return GL_FALSE;
+               else
+                       nargs++;
+       }
+
+       for(i = 0; i < nargs; ++i) {
+               int source;
+               if (pairinst->NeedRGB && !pairinst->IsTranscendent) {
+                       GLboolean srcrgb = GL_FALSE;
+                       GLboolean srcalpha = GL_FALSE;
+                       GLuint negatebase = 0;
+                       int j;
+                       for(j = 0; j < 3; ++j) {
+                               GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, j);
+                               if (swz < 3)
+                                       srcrgb = GL_TRUE;
+                               else if (swz < 4)
+                                       srcalpha = GL_TRUE;
+                               if (swz != SWIZZLE_NIL && GET_BIT(inst->SrcReg[i].NegateBase, j))
+                                       negatebase = 1;
+                       }
+                       source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha);
+                       if (source < 0)
+                               return GL_FALSE;
+                       pair->RGB.Arg[i].Source = source;
+                       pair->RGB.Arg[i].Swizzle = inst->SrcReg[i].Swizzle & 0x1ff;
+                       pair->RGB.Arg[i].Abs = inst->SrcReg[i].Abs;
+                       pair->RGB.Arg[i].Negate = (negatebase & ~pair->RGB.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs;
+               }
+               if (pairinst->NeedAlpha) {
+                       GLboolean srcrgb = GL_FALSE;
+                       GLboolean srcalpha = GL_FALSE;
+                       GLuint negatebase = GET_BIT(inst->SrcReg[i].NegateBase, pairinst->IsTranscendent ? 0 : 3);
+                       GLuint swz = GET_SWZ(inst->SrcReg[i].Swizzle, pairinst->IsTranscendent ? 0 : 3);
+                       if (swz < 3)
+                               srcrgb = GL_TRUE;
+                       else if (swz < 4)
+                               srcalpha = GL_TRUE;
+                       source = alloc_pair_source(s, pair, inst->SrcReg[i], srcrgb, srcalpha);
+                       if (source < 0)
+                               return GL_FALSE;
+                       pair->Alpha.Arg[i].Source = source;
+                       pair->Alpha.Arg[i].Swizzle = swz;
+                       pair->Alpha.Arg[i].Abs = inst->SrcReg[i].Abs;
+                       pair->Alpha.Arg[i].Negate = (negatebase & ~pair->RGB.Arg[i].Abs) ^ inst->SrcReg[i].NegateAbs;
+               }
+       }
+
+       return GL_TRUE;
+}
+
+
+/**
+ * Fill in the destination register information.
+ *
+ * This is split from filling in source registers because we want
+ * to avoid allocating hardware temporaries for destinations until
+ * we are absolutely certain that we're going to emit a certain
+ * instruction pairing.
+ */
+static void fill_dest_into_pair(struct pair_state *s, struct radeon_pair_instruction *pair, int ip)
+{
+       struct pair_state_instruction *pairinst = s->Instructions + ip;
+       struct prog_instruction *inst = s->Program->Instructions + ip;
+
+       if (inst->DstReg.File == PROGRAM_OUTPUT) {
+               if (inst->DstReg.Index == FRAG_RESULT_COLOR) {
+                       pair->RGB.OutputWriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ;
+                       pair->Alpha.OutputWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+               } else if (inst->DstReg.Index == FRAG_RESULT_DEPTH) {
+                       pair->Alpha.DepthWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+               }
+       } else {
+               GLuint hwindex = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
+               if (pairinst->NeedRGB) {
+                       pair->RGB.DestIndex = hwindex;
+                       pair->RGB.WriteMask |= inst->DstReg.WriteMask & WRITEMASK_XYZ;
+               }
+               if (pairinst->NeedAlpha) {
+                       pair->Alpha.DestIndex = hwindex;
+                       pair->Alpha.WriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+               }
+       }
+}
+
+
+/**
+ * Find a good ALU instruction or pair of ALU instruction and emit it.
+ *
+ * Prefer emitting full ALU instructions, so that when we reach a point
+ * where no full ALU instruction can be emitted, we have more candidates
+ * for RGB/Alpha pairing.
+ */
+static void emit_alu(struct pair_state *s)
+{
+       struct radeon_pair_instruction pair;
+
+       if (s->ReadyFullALU || !(s->ReadyRGB && s->ReadyAlpha)) {
+               int ip;
+               if (s->ReadyFullALU) {
+                       ip = s->ReadyFullALU - s->Instructions;
+                       s->ReadyFullALU = s->ReadyFullALU->NextReady;
+               } else if (s->ReadyRGB) {
+                       ip = s->ReadyRGB - s->Instructions;
+                       s->ReadyRGB = s->ReadyRGB->NextReady;
+               } else {
+                       ip = s->ReadyAlpha - s->Instructions;
+                       s->ReadyAlpha = s->ReadyAlpha->NextReady;
+               }
+
+               _mesa_bzero(&pair, sizeof(pair));
+               fill_instruction_into_pair(s, &pair, ip);
+               fill_dest_into_pair(s, &pair, ip);
+               commit_instruction(s, ip);
+       } else {
+               struct pair_state_instruction **prgb;
+               struct pair_state_instruction **palpha;
+
+               /* Some pairings might fail because they require too
+                * many source slots; try all possible pairings if necessary */
+               for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) {
+                       for(palpha = &s->ReadyAlpha; *palpha; palpha = &(*palpha)->NextReady) {
+                               int rgbip = *prgb - s->Instructions;
+                               int alphaip = *palpha - s->Instructions;
+                               _mesa_bzero(&pair, sizeof(pair));
+                               fill_instruction_into_pair(s, &pair, rgbip);
+                               if (!fill_instruction_into_pair(s, &pair, alphaip))
+                                       continue;
+                               *prgb = (*prgb)->NextReady;
+                               *palpha = (*palpha)->NextReady;
+                               fill_dest_into_pair(s, &pair, rgbip);
+                               fill_dest_into_pair(s, &pair, alphaip);
+                               commit_instruction(s, rgbip);
+                               commit_instruction(s, alphaip);
+                               goto success;
+                       }
+               }
+
+               /* No success in pairing; just take the first RGB instruction */
+               int ip = s->ReadyRGB - s->Instructions;
+               s->ReadyRGB = s->ReadyRGB->NextReady;
+               _mesa_bzero(&pair, sizeof(pair));
+               fill_instruction_into_pair(s, &pair, ip);
+               fill_dest_into_pair(s, &pair, ip);
+               commit_instruction(s, ip);
+       success: ;
+       }
+
+       if (s->Debug)
+               radeonPrintPairInstruction(&pair);
+
+       s->Error = s->Error || !s->Handler->EmitPaired(s->UserData, &pair);
+}
+
+
+GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
+       const struct radeon_pair_handler* handler, void *userdata)
+{
+       struct pair_state s;
+
+       _mesa_bzero(&s, sizeof(s));
+       s.Ctx = ctx;
+       s.Program = program;
+       s.Handler = handler;
+       s.UserData = userdata;
+       s.Debug = (RADEON_DEBUG & DEBUG_PIXEL) ? GL_TRUE : GL_FALSE;
+       s.Verbose = GL_FALSE && s.Debug;
+
+       s.Instructions = (struct pair_state_instruction*)_mesa_calloc(
+               sizeof(struct pair_state_instruction)*s.Program->NumInstructions);
+       s.ValuePool = (struct reg_value*)_mesa_calloc(sizeof(struct reg_value)*s.Program->NumInstructions*4);
+       s.ReaderPool = (struct reg_value_reader*)_mesa_calloc(
+               sizeof(struct reg_value_reader)*s.Program->NumInstructions*12);
+
+       if (s.Debug)
+               _mesa_printf("Emit paired program\n");
+
+       scan_instructions(&s);
+       allocate_input_registers(&s);
+
+       while(!s.Error &&
+             (s.ReadyTEX || s.ReadyRGB || s.ReadyAlpha || s.ReadyFullALU)) {
+               if (s.ReadyTEX)
+                       emit_all_tex(&s);
+
+               while(s.ReadyFullALU || s.ReadyRGB || s.ReadyAlpha)
+                       emit_alu(&s);
+       }
+
+       if (s.Debug)
+               _mesa_printf(" END\n");
+
+       _mesa_free(s.Instructions);
+       _mesa_free(s.ValuePool);
+       _mesa_free(s.ReaderPool);
+
+       return !s.Error;
+}
+
+
+static void print_pair_src(int i, struct radeon_pair_instruction_source* src)
+{
+       _mesa_printf("  Src%i = %s[%i]", i, src->Constant ? "CNST" : "TEMP", src->Index);
+}
+
+static const char* opcode_string(GLuint opcode)
+{
+       if (opcode == OPCODE_REPL_ALPHA)
+               return "SOP";
+       else
+               return _mesa_opcode_string(opcode);
+}
+
+static int num_pairinst_args(GLuint opcode)
+{
+       if (opcode == OPCODE_REPL_ALPHA)
+               return 0;
+       else
+               return _mesa_num_inst_src_regs(opcode);
+}
+
+static char swizzle_char(GLuint swz)
+{
+       switch(swz) {
+       case SWIZZLE_X: return 'x';
+       case SWIZZLE_Y: return 'y';
+       case SWIZZLE_Z: return 'z';
+       case SWIZZLE_W: return 'w';
+       case SWIZZLE_ZERO: return '0';
+       case SWIZZLE_ONE: return '1';
+       case SWIZZLE_NIL: return '_';
+       default: return '?';
+       }
+}
+
+void radeonPrintPairInstruction(struct radeon_pair_instruction *inst)
+{
+       int nargs;
+       int i;
+
+       _mesa_printf("       RGB:  ");
+       for(i = 0; i < 3; ++i) {
+               if (inst->RGB.Src[i].Used)
+                       print_pair_src(i, inst->RGB.Src + i);
+       }
+       _mesa_printf("\n");
+       _mesa_printf("       Alpha:");
+       for(i = 0; i < 3; ++i) {
+               if (inst->Alpha.Src[i].Used)
+                       print_pair_src(i, inst->Alpha.Src + i);
+       }
+       _mesa_printf("\n");
+
+       _mesa_printf("  %s%s", opcode_string(inst->RGB.Opcode), inst->RGB.Saturate ? "_SAT" : "");
+       if (inst->RGB.WriteMask)
+               _mesa_printf(" TEMP[%i].%s%s%s", inst->RGB.DestIndex,
+                       (inst->RGB.WriteMask & 1) ? "x" : "",
+                       (inst->RGB.WriteMask & 2) ? "y" : "",
+                       (inst->RGB.WriteMask & 4) ? "z" : "");
+       if (inst->RGB.OutputWriteMask)
+               _mesa_printf(" COLOR.%s%s%s",
+                       (inst->RGB.OutputWriteMask & 1) ? "x" : "",
+                       (inst->RGB.OutputWriteMask & 2) ? "y" : "",
+                       (inst->RGB.OutputWriteMask & 4) ? "z" : "");
+       nargs = num_pairinst_args(inst->RGB.Opcode);
+       for(i = 0; i < nargs; ++i) {
+               const char* abs = inst->RGB.Arg[i].Abs ? "|" : "";
+               const char* neg = inst->RGB.Arg[i].Negate ? "-" : "";
+               _mesa_printf(", %s%sSrc%i.%c%c%c%s", neg, abs, inst->RGB.Arg[i].Source,
+                       swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 0)),
+                       swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 1)),
+                       swizzle_char(GET_SWZ(inst->RGB.Arg[i].Swizzle, 2)),
+                       abs);
+       }
+       _mesa_printf("\n");
+
+       _mesa_printf("  %s%s", opcode_string(inst->Alpha.Opcode), inst->Alpha.Saturate ? "_SAT" : "");
+       if (inst->Alpha.WriteMask)
+               _mesa_printf(" TEMP[%i].w", inst->Alpha.DestIndex);
+       if (inst->Alpha.OutputWriteMask)
+               _mesa_printf(" COLOR.w");
+       if (inst->Alpha.DepthWriteMask)
+               _mesa_printf(" DEPTH.w");
+       nargs = num_pairinst_args(inst->Alpha.Opcode);
+       for(i = 0; i < nargs; ++i) {
+               const char* abs = inst->Alpha.Arg[i].Abs ? "|" : "";
+               const char* neg = inst->Alpha.Arg[i].Negate ? "-" : "";
+               _mesa_printf(", %s%sSrc%i.%c%s", neg, abs, inst->Alpha.Arg[i].Source,
+                       swizzle_char(inst->Alpha.Arg[i].Swizzle), abs);
+       }
+       _mesa_printf("\n");
+}
diff --git a/src/mesa/drivers/dri/r600/radeon_program_pair.h b/src/mesa/drivers/dri/r600/radeon_program_pair.h
new file mode 100644 (file)
index 0000000..4624a24
--- /dev/null
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RADEON_PROGRAM_PAIR_H_
+#define __RADEON_PROGRAM_PAIR_H_
+
+#include "radeon_program.h"
+
+
+/**
+ * Represents a paired instruction, as found in R300 and R500
+ * fragment programs.
+ */
+struct radeon_pair_instruction_source {
+       GLuint Index:8;
+       GLuint Constant:1;
+       GLuint Used:1;
+};
+
+struct radeon_pair_instruction_rgb {
+       GLuint Opcode:8;
+       GLuint DestIndex:8;
+       GLuint WriteMask:3;
+       GLuint OutputWriteMask:3;
+       GLuint Saturate:1;
+
+       struct radeon_pair_instruction_source Src[3];
+
+       struct {
+               GLuint Source:2;
+               GLuint Swizzle:9;
+               GLuint Abs:1;
+               GLuint Negate:1;
+       } Arg[3];
+};
+
+struct radeon_pair_instruction_alpha {
+       GLuint Opcode:8;
+       GLuint DestIndex:8;
+       GLuint WriteMask:1;
+       GLuint OutputWriteMask:1;
+       GLuint DepthWriteMask:1;
+       GLuint Saturate:1;
+
+       struct radeon_pair_instruction_source Src[3];
+
+       struct {
+               GLuint Source:2;
+               GLuint Swizzle:3;
+               GLuint Abs:1;
+               GLuint Negate:1;
+       } Arg[3];
+};
+
+struct radeon_pair_instruction {
+       struct radeon_pair_instruction_rgb RGB;
+       struct radeon_pair_instruction_alpha Alpha;
+};
+
+
+/**
+ *
+ */
+struct radeon_pair_handler {
+       /**
+        * Fill in the proper hardware index for the given constant register.
+        *
+        * @return GL_FALSE on error.
+        */
+       GLboolean (*EmitConst)(void*, GLuint file, GLuint index, GLuint *hwindex);
+
+       /**
+        * Write a paired instruction to the hardware.
+        *
+        * @return GL_FALSE on error.
+        */
+       GLboolean (*EmitPaired)(void*, struct radeon_pair_instruction*);
+
+       /**
+        * Write a texture instruction to the hardware.
+        * Register indices have already been rewritten to the allocated
+        * hardware register numbers.
+        *
+        * @return GL_FALSE on error.
+        */
+       GLboolean (*EmitTex)(void*, struct prog_instruction*);
+
+       /**
+        * Called before a block of contiguous, independent texture
+        * instructions is emitted.
+        */
+       GLboolean (*BeginTexBlock)(void*);
+
+       GLuint MaxHwTemps;
+};
+
+GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
+       const struct radeon_pair_handler*, void *userdata);
+
+void radeonPrintPairInstruction(struct radeon_pair_instruction *inst);
+
+#endif /* __RADEON_PROGRAM_PAIR_H_ */
index f6bd1eb..26edf3d 100644 (file)
 #define PCI_CHIP_RS740_796E             0x796E
 #define PCI_CHIP_RS740_796F             0x796F
 
+#define PCI_CHIP_R600_9400              0x9400
+#define PCI_CHIP_R600_9401              0x9401
+#define PCI_CHIP_R600_9402              0x9402
+#define PCI_CHIP_R600_9403              0x9403
+#define PCI_CHIP_R600_9405              0x9405
+#define PCI_CHIP_R600_940A              0x940A
+#define PCI_CHIP_R600_940B              0x940B
+#define PCI_CHIP_R600_940F              0x940F
+
+#define PCI_CHIP_RV610_94C0             0x94C0
+#define PCI_CHIP_RV610_94C1             0x94C1
+#define PCI_CHIP_RV610_94C3             0x94C3
+#define PCI_CHIP_RV610_94C4             0x94C4
+#define PCI_CHIP_RV610_94C5             0x94C5
+#define PCI_CHIP_RV610_94C6             0x94C6
+#define PCI_CHIP_RV610_94C7             0x94C7
+#define PCI_CHIP_RV610_94C8             0x94C8
+#define PCI_CHIP_RV610_94C9             0x94C9
+#define PCI_CHIP_RV610_94CB             0x94CB
+#define PCI_CHIP_RV610_94CC             0x94CC
+#define PCI_CHIP_RV610_94CD             0x94CD
+
+#define PCI_CHIP_RV630_9580             0x9580
+#define PCI_CHIP_RV630_9581             0x9581
+#define PCI_CHIP_RV630_9583             0x9583
+#define PCI_CHIP_RV630_9586             0x9586
+#define PCI_CHIP_RV630_9587             0x9587
+#define PCI_CHIP_RV630_9588             0x9588
+#define PCI_CHIP_RV630_9589             0x9589
+#define PCI_CHIP_RV630_958A             0x958A
+#define PCI_CHIP_RV630_958B             0x958B
+#define PCI_CHIP_RV630_958C             0x958C
+#define PCI_CHIP_RV630_958D             0x958D
+#define PCI_CHIP_RV630_958E             0x958E
+#define PCI_CHIP_RV630_958F             0x958F
+
+#define PCI_CHIP_RV670_9500             0x9500
+#define PCI_CHIP_RV670_9501             0x9501
+#define PCI_CHIP_RV670_9504             0x9504
+#define PCI_CHIP_RV670_9505             0x9505
+#define PCI_CHIP_RV670_9506             0x9506
+#define PCI_CHIP_RV670_9507             0x9507
+#define PCI_CHIP_RV670_9508             0x9508
+#define PCI_CHIP_RV670_9509             0x9509
+#define PCI_CHIP_RV670_950F             0x950F
+#define PCI_CHIP_RV670_9511             0x9511
+#define PCI_CHIP_RV670_9515             0x9515
+#define PCI_CHIP_RV670_9517             0x9517
+#define PCI_CHIP_RV670_9519             0x9519
+
+#define PCI_CHIP_RV620_95C0             0x95C0
+#define PCI_CHIP_RV620_95C2             0x95C2
+#define PCI_CHIP_RV620_95C4             0x95C4
+#define PCI_CHIP_RV620_95C5             0x95C5
+#define PCI_CHIP_RV620_95C6             0x95C6
+#define PCI_CHIP_RV620_95C7             0x95C7
+#define PCI_CHIP_RV620_95C9             0x95C9
+#define PCI_CHIP_RV620_95CC             0x95CC
+#define PCI_CHIP_RV620_95CD             0x95CD
+#define PCI_CHIP_RV620_95CE             0x95CE
+#define PCI_CHIP_RV620_95CF             0x95CF
+
+#define PCI_CHIP_RV635_9590             0x9590
+#define PCI_CHIP_RV635_9591             0x9591
+#define PCI_CHIP_RV635_9593             0x9593
+#define PCI_CHIP_RV635_9595             0x9595
+#define PCI_CHIP_RV635_9596             0x9596
+#define PCI_CHIP_RV635_9597             0x9597
+#define PCI_CHIP_RV635_9598             0x9598
+#define PCI_CHIP_RV635_9599             0x9599
+#define PCI_CHIP_RV635_959B             0x959B
+
+#define PCI_CHIP_RS780_9611             0x9611
+#define PCI_CHIP_RS780_9612             0x9612
+#define PCI_CHIP_RS780_9613             0x9613
+#define PCI_CHIP_RS780_9614             0x9614
+#define PCI_CHIP_RS780_9615             0x9615
+#define PCI_CHIP_RS780_9616             0x9616
+
+#define PCI_CHIP_RV770_9440             0x9440
+#define PCI_CHIP_RV770_9441             0x9441
+#define PCI_CHIP_RV770_9442             0x9442
+#define PCI_CHIP_RV770_9444             0x9444
+#define PCI_CHIP_RV770_9446             0x9446
+#define PCI_CHIP_RV770_944A             0x944A
+#define PCI_CHIP_RV770_944B             0x944B
+#define PCI_CHIP_RV770_944C             0x944C
+#define PCI_CHIP_RV770_944E             0x944E
+#define PCI_CHIP_RV770_9450             0x9450
+#define PCI_CHIP_RV770_9452             0x9452
+#define PCI_CHIP_RV770_9456             0x9456
+#define PCI_CHIP_RV770_945A             0x945A
+#define PCI_CHIP_RV770_945B             0x945B
+#define PCI_CHIP_RV790_9460             0x9460
+#define PCI_CHIP_RV790_9462             0x9462
+#define PCI_CHIP_RV770_946A             0x946A
+#define PCI_CHIP_RV770_946B             0x946B
+#define PCI_CHIP_RV770_947A             0x947A
+#define PCI_CHIP_RV770_947B             0x947B
+
+#define PCI_CHIP_RV730_9487             0x9487
+#define PCI_CHIP_RV730_9489             0x9489
+#define PCI_CHIP_RV730_948F             0x948F
+#define PCI_CHIP_RV730_9490             0x9490
+#define PCI_CHIP_RV730_9491             0x9491
+#define PCI_CHIP_RV730_9498             0x9498
+#define PCI_CHIP_RV730_949C             0x949C
+#define PCI_CHIP_RV730_949E             0x949E
+#define PCI_CHIP_RV730_949F             0x949F
+
+#define PCI_CHIP_RV710_9540             0x9540
+#define PCI_CHIP_RV710_9541             0x9541
+#define PCI_CHIP_RV710_9542             0x9542
+#define PCI_CHIP_RV710_954E             0x954E
+#define PCI_CHIP_RV710_954F             0x954F
+#define PCI_CHIP_RV710_9552             0x9552
+#define PCI_CHIP_RV710_9553             0x9553
+#define PCI_CHIP_RV710_9555             0x9555
 
 enum {
    CHIP_FAMILY_R100,
@@ -282,6 +400,16 @@ enum {
    CHIP_FAMILY_R580,
    CHIP_FAMILY_RV560,
    CHIP_FAMILY_RV570,
+   CHIP_FAMILY_R600,
+   CHIP_FAMILY_RV610,
+   CHIP_FAMILY_RV630,
+   CHIP_FAMILY_RV670,
+   CHIP_FAMILY_RV620,
+   CHIP_FAMILY_RV635,
+   CHIP_FAMILY_RS780,
+   CHIP_FAMILY_RV770,
+   CHIP_FAMILY_RV730,
+   CHIP_FAMILY_RV710,
    CHIP_FAMILY_LAST
 };
 
@@ -289,6 +417,7 @@ enum {
 #define RADEON_CLASS_R100              (0 << 0)
 #define RADEON_CLASS_R200              (1 << 0)
 #define RADEON_CLASS_R300              (2 << 0)
+#define RADEON_CLASS_R600              (3 << 0)
 #define RADEON_CLASS_MASK              (3 << 0)
 
 #define RADEON_CHIPSET_TCL             (1 << 2)        /* tcl support - any radeon */
index ba74c97..f8a29fd 100644 (file)
@@ -60,7 +60,9 @@ static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
 
        switch (name) {
        case GL_VENDOR:
-               if (IS_R300_CLASS(radeon->radeonScreen))
+               if (IS_R600_CLASS(radeon->radeonScreen))
+                       return (GLubyte *) "Advanced Micro Devices, Inc.";
+               else if (IS_R300_CLASS(radeon->radeonScreen))
                        return (GLubyte *) "DRI R300 Project";
                else
                        return (GLubyte *) "Tungsten Graphics, Inc.";
@@ -72,7 +74,9 @@ static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
                        radeon->radeonScreen->AGPMode;
                const char* chipname;
 
-               if (IS_R300_CLASS(radeon->radeonScreen))
+               if (IS_R600_CLASS(radeon->radeonScreen))
+                       chipname = "R600";
+               else if (IS_R300_CLASS(radeon->radeonScreen))
                        chipname = "R300";
                else if (IS_R200_CLASS(radeon->radeonScreen))
                        chipname = "R200";
@@ -82,7 +86,9 @@ static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
                offset = driGetRendererString(buffer, chipname, DRIVER_DATE,
                                              agp_mode);
 
-               if (IS_R300_CLASS(radeon->radeonScreen)) {
+               if (IS_R600_CLASS(radeon->radeonScreen)) {
+                       sprintf(&buffer[offset], " TCL");
+               } else if (IS_R300_CLASS(radeon->radeonScreen)) {
                        sprintf(&buffer[offset], " %sTCL",
                                (radeon->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)
                                ? "" : "NO-");
index 49c7eae..ef1f846 100644 (file)
@@ -59,6 +59,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "r300_fragprog.h"
 #include "r300_tex.h"
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+#include "r600_context.h"
+#include "r600_fragprog.h"
+#include "r600_tex.h"
 #endif
 
 #include "utils.h"
@@ -144,7 +148,7 @@ extern const struct dri_extension NV_vp_extension[];
 extern const struct dri_extension ATI_fs_extension[];
 extern const struct dri_extension point_extensions[];
 
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+#elif RADEON_COMMON && (defined(RADEON_COMMON_FOR_R300) || defined(RADEON_COMMON_FOR_R600))
 
 /* TODO: integrate these into xmlpool.h! */
 #define DRI_CONF_MAX_TEXTURE_IMAGE_UNITS(def,min,max) \
@@ -393,6 +397,19 @@ static const __DRItexBufferExtension r300TexBufferExtension = {
 };
 #endif
 
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+static const __DRItexOffsetExtension r300texOffsetExtension = {
+    { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
+   r300SetTexOffset,
+};
+
+static const __DRItexBufferExtension r300TexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   r300SetTexBuffer,
+   r300SetTexBuffer2,
+};
+#endif
+
 static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
 {
    screen->chip_flags = 0;
@@ -732,6 +749,155 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
       screen->chip_flags = RADEON_CHIPSET_TCL;
       break;
 
+   case PCI_CHIP_R600_9400:
+   case PCI_CHIP_R600_9401:
+   case PCI_CHIP_R600_9402:
+   case PCI_CHIP_R600_9403:
+   case PCI_CHIP_R600_9405:
+   case PCI_CHIP_R600_940A:
+   case PCI_CHIP_R600_940B:
+   case PCI_CHIP_R600_940F:
+      screen->chip_family = CHIP_FAMILY_R600;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV610_94C0:
+   case PCI_CHIP_RV610_94C1:
+   case PCI_CHIP_RV610_94C3:
+   case PCI_CHIP_RV610_94C4:
+   case PCI_CHIP_RV610_94C5:
+   case PCI_CHIP_RV610_94C6:
+   case PCI_CHIP_RV610_94C7:
+   case PCI_CHIP_RV610_94C8:
+   case PCI_CHIP_RV610_94C9:
+   case PCI_CHIP_RV610_94CB:
+   case PCI_CHIP_RV610_94CC:
+   case PCI_CHIP_RV610_94CD:
+      screen->chip_family = CHIP_FAMILY_RV610;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV630_9580:
+   case PCI_CHIP_RV630_9581:
+   case PCI_CHIP_RV630_9583:
+   case PCI_CHIP_RV630_9586:
+   case PCI_CHIP_RV630_9587:
+   case PCI_CHIP_RV630_9588:
+   case PCI_CHIP_RV630_9589:
+   case PCI_CHIP_RV630_958A:
+   case PCI_CHIP_RV630_958B:
+   case PCI_CHIP_RV630_958C:
+   case PCI_CHIP_RV630_958D:
+   case PCI_CHIP_RV630_958E:
+   case PCI_CHIP_RV630_958F:
+      screen->chip_family = CHIP_FAMILY_RV630;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV670_9500:
+   case PCI_CHIP_RV670_9501:
+   case PCI_CHIP_RV670_9504:
+   case PCI_CHIP_RV670_9505:
+   case PCI_CHIP_RV670_9506:
+   case PCI_CHIP_RV670_9507:
+   case PCI_CHIP_RV670_9508:
+   case PCI_CHIP_RV670_9509:
+   case PCI_CHIP_RV670_950F:
+   case PCI_CHIP_RV670_9511:
+   case PCI_CHIP_RV670_9515:
+   case PCI_CHIP_RV670_9517:
+   case PCI_CHIP_RV670_9519:
+      screen->chip_family = CHIP_FAMILY_RV670;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV620_95C0:
+   case PCI_CHIP_RV620_95C2:
+   case PCI_CHIP_RV620_95C4:
+   case PCI_CHIP_RV620_95C5:
+   case PCI_CHIP_RV620_95C6:
+   case PCI_CHIP_RV620_95C7:
+   case PCI_CHIP_RV620_95C9:
+   case PCI_CHIP_RV620_95CC:
+   case PCI_CHIP_RV620_95CD:
+   case PCI_CHIP_RV620_95CE:
+   case PCI_CHIP_RV620_95CF:
+      screen->chip_family = CHIP_FAMILY_RV620;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV635_9590:
+   case PCI_CHIP_RV635_9591:
+   case PCI_CHIP_RV635_9593:
+   case PCI_CHIP_RV635_9595:
+   case PCI_CHIP_RV635_9596:
+   case PCI_CHIP_RV635_9597:
+   case PCI_CHIP_RV635_9598:
+   case PCI_CHIP_RV635_9599:
+   case PCI_CHIP_RV635_959B:
+      screen->chip_family = CHIP_FAMILY_RV635;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RS780_9611:
+   case PCI_CHIP_RS780_9612:
+   case PCI_CHIP_RS780_9613:
+   case PCI_CHIP_RS780_9614:
+   case PCI_CHIP_RS780_9615:
+   case PCI_CHIP_RS780_9616:
+      screen->chip_family = CHIP_FAMILY_RS780;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV770_9440:
+   case PCI_CHIP_RV770_9441:
+   case PCI_CHIP_RV770_9442:
+   case PCI_CHIP_RV770_9444:
+   case PCI_CHIP_RV770_9446:
+   case PCI_CHIP_RV770_944A:
+   case PCI_CHIP_RV770_944B:
+   case PCI_CHIP_RV770_944C:
+   case PCI_CHIP_RV770_944E:
+   case PCI_CHIP_RV770_9450:
+   case PCI_CHIP_RV770_9452:
+   case PCI_CHIP_RV770_9456:
+   case PCI_CHIP_RV770_945A:
+   case PCI_CHIP_RV770_945B:
+   case PCI_CHIP_RV790_9460:
+   case PCI_CHIP_RV790_9462:
+   case PCI_CHIP_RV770_946A:
+   case PCI_CHIP_RV770_946B:
+   case PCI_CHIP_RV770_947A:
+   case PCI_CHIP_RV770_947B:
+      screen->chip_family = CHIP_FAMILY_RV770;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV730_9487:
+   case PCI_CHIP_RV730_9489:
+   case PCI_CHIP_RV730_948F:
+   case PCI_CHIP_RV730_9490:
+   case PCI_CHIP_RV730_9491:
+   case PCI_CHIP_RV730_9498:
+   case PCI_CHIP_RV730_949C:
+   case PCI_CHIP_RV730_949E:
+   case PCI_CHIP_RV730_949F:
+      screen->chip_family = CHIP_FAMILY_RV730;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV710_9540:
+   case PCI_CHIP_RV710_9541:
+   case PCI_CHIP_RV710_9542:
+   case PCI_CHIP_RV710_954E:
+   case PCI_CHIP_RV710_954F:
+   case PCI_CHIP_RV710_9552:
+   case PCI_CHIP_RV710_9553:
+   case PCI_CHIP_RV710_9555:
+      screen->chip_family = CHIP_FAMILY_RV710;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
    default:
       fprintf(stderr, "unknown chip id 0x%x, can't guess.\n",
              device_id);
@@ -901,14 +1067,16 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
    }
 
    if (getenv("R300_NO_TCL"))
-     screen->chip_flags &= ~RADEON_CHIPSET_TCL;
+          screen->chip_flags &= ~RADEON_CHIPSET_TCL;
 
    if (screen->chip_family <= CHIP_FAMILY_RS200)
-      screen->chip_flags |= RADEON_CLASS_R100;
+          screen->chip_flags |= RADEON_CLASS_R100;
    else if (screen->chip_family <= CHIP_FAMILY_RV280)
-      screen->chip_flags |= RADEON_CLASS_R200;
+          screen->chip_flags |= RADEON_CLASS_R200;
+   else if (screen->chip_family <= CHIP_FAMILY_RV570)
+          screen->chip_flags |= RADEON_CLASS_R300;
    else
-      screen->chip_flags |= RADEON_CLASS_R300;
+          screen->chip_flags |= RADEON_CLASS_R600;
 
    screen->cpp = dri_priv->bpp / 8;
    screen->AGPMode = dri_priv->AGPMode;
@@ -926,7 +1094,7 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
        screen->fbLocation = (temp & 0xffff) << 16;
    }
 
-   if (screen->chip_family >= CHIP_FAMILY_R300) {
+   if (IS_R300_CLASS(screen)) {
        ret = radeonGetParam(sPriv, RADEON_PARAM_NUM_GB_PIPES, &temp);
        if (ret) {
           fprintf(stderr, "Unable to get num_pipes, need newer drm\n");
@@ -1031,6 +1199,10 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
         screen->extensions[i++] = &r300texOffsetExtension.base;
 #endif
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+        screen->extensions[i++] = &r300texOffsetExtension.base;
+#endif
    }
 
    screen->extensions[i++] = NULL;
@@ -1095,7 +1267,19 @@ radeonCreateScreen2(__DRIscreenPrivate *sPriv)
    if (ret == -1)
      return NULL;
 
-   if (screen->chip_family >= CHIP_FAMILY_R300) {
+   if (getenv("R300_NO_TCL"))
+          screen->chip_flags &= ~RADEON_CHIPSET_TCL;
+
+   if (screen->chip_family <= CHIP_FAMILY_RS200)
+          screen->chip_flags |= RADEON_CLASS_R100;
+   else if (screen->chip_family <= CHIP_FAMILY_RV280)
+          screen->chip_flags |= RADEON_CLASS_R200;
+   else if (screen->chip_family <= CHIP_FAMILY_RV570)
+          screen->chip_flags |= RADEON_CLASS_R300;
+   else
+          screen->chip_flags |= RADEON_CLASS_R600;
+
+   if (IS_R300_CLASS(screen)) {
        ret = radeonGetParam(sPriv, RADEON_PARAM_NUM_GB_PIPES, &temp);
        if (ret) {
           fprintf(stderr, "Unable to get num_pipes, need newer drm\n");
@@ -1124,16 +1308,6 @@ radeonCreateScreen2(__DRIscreenPrivate *sPriv)
        }
    }
 
-   if (screen->chip_family <= CHIP_FAMILY_RS200)
-      screen->chip_flags |= RADEON_CLASS_R100;
-   else if (screen->chip_family <= CHIP_FAMILY_RV280)
-      screen->chip_flags |= RADEON_CLASS_R200;
-   else
-      screen->chip_flags |= RADEON_CLASS_R300;
-
-   if (getenv("R300_NO_TCL"))
-     screen->chip_flags &= ~RADEON_CHIPSET_TCL;
-
    i = 0;
    screen->extensions[i++] = &driCopySubBufferExtension.base;
    screen->extensions[i++] = &driFrameTrackingExtension.base;
@@ -1159,6 +1333,10 @@ radeonCreateScreen2(__DRIscreenPrivate *sPriv)
    screen->extensions[i++] = &r300TexBufferExtension.base;
 #endif
 
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+   screen->extensions[i++] = &r300TexBufferExtension.base;
+#endif
+
    screen->extensions[i++] = NULL;
    sPriv->extensions = screen->extensions;
 
@@ -1352,6 +1530,11 @@ static GLboolean radeonCreateContext(const __GLcontextModes * glVisual,
 {
        __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
        radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+       if (IS_R600_CLASS(screen))
+               return r300CreateContext(glVisual, driContextPriv, sharedContextPriv);
+#endif
+
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
        if (IS_R300_CLASS(screen))
                return r300CreateContext(glVisual, driContextPriv, sharedContextPriv);
@@ -1394,6 +1577,11 @@ radeonInitScreen(__DRIscreenPrivate *psp)
    static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
    static const __DRIversion dri_expected = { 4, 0, 0 };
    static const __DRIversion drm_expected = { 1, 24, 0 };
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+   static const char *driver_name = "R600";
+   static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
+   static const __DRIversion dri_expected = { 4, 0, 0 };
+   static const __DRIversion drm_expected = { 1, 24, 0 };
 #endif
    RADEONDRIPtr dri_priv = (RADEONDRIPtr) psp->pDevPriv;
 
@@ -1421,7 +1609,7 @@ radeonInitScreen(__DRIscreenPrivate *psp)
    driInitSingleExtension( NULL, NV_vp_extension );
    driInitSingleExtension( NULL, ATI_fs_extension );
    driInitExtensions( NULL, point_extensions, GL_FALSE );
-#elif defined(RADEON_COMMON_FOR_R300)
+#elif (defined(RADEON_COMMON_FOR_R300) || defined(RADEON_COMMON_FOR_R600))
    driInitSingleExtension( NULL, gl_20_extension );
 #endif
 
index 8605eb4..5194224 100644 (file)
@@ -117,6 +117,8 @@ typedef struct radeon_screen {
        ((screen->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R200)
 #define IS_R300_CLASS(screen) \
        ((screen->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R300)
+#define IS_R600_CLASS(screen) \
+       ((screen->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R600)
 
 extern void radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv);
 #endif /* __RADEON_SCREEN_H__ */